Re: [Xen-devel] [PATCH v8 2/4] add a domain_tot_pages() helper function

2020-01-31 Thread Durrant, Paul
> -Original Message-
> From: Xen-devel  On Behalf Of
> Durrant, Paul
> Sent: 31 January 2020 15:19
> To: Jan Beulich ; Julien Grall 
> Cc: Stefano Stabellini ; Wei Liu ;
> Konrad Rzeszutek Wilk ; George Dunlap
> ; Andrew Cooper ;
> Ian Jackson ; Tim Deegan ; xen-
> de...@lists.xenproject.org; Roger Pau Monné 
> Subject: Re: [Xen-devel] [PATCH v8 2/4] add a domain_tot_pages() helper
> function
> 
> > -Original Message-
> > From: Jan Beulich 
> > Sent: 31 January 2020 12:53
> > To: Durrant, Paul 
> > Cc: xen-devel@lists.xenproject.org; Andrew Cooper
> > ; Wei Liu ; Roger Pau Monné
> > ; George Dunlap ; Ian
> > Jackson ; Julien Grall ;
> Konrad
> > Rzeszutek Wilk ; Stefano Stabellini
> > ; Tim Deegan 
> > Subject: Re: [PATCH v8 2/4] add a domain_tot_pages() helper function
> >
> > On 30.01.2020 15:57, Paul Durrant wrote:
> > > v8:
> > >  - New in v8
> > > ---
> > >  xen/arch/x86/domain.c   |  2 +-
> > >  xen/arch/x86/mm.c   |  6 +++---
> > >  xen/arch/x86/mm/p2m-pod.c   | 10 +-
> > >  xen/arch/x86/mm/shadow/common.c |  2 +-
> > >  xen/arch/x86/msi.c  |  2 +-
> > >  xen/arch/x86/numa.c |  2 +-
> > >  xen/arch/x86/pv/dom0_build.c| 25 +
> > >  xen/arch/x86/pv/domain.c|  2 +-
> > >  xen/common/domctl.c |  2 +-
> > >  xen/common/grant_table.c|  4 ++--
> > >  xen/common/keyhandler.c |  2 +-
> > >  xen/common/memory.c |  4 ++--
> > >  xen/common/page_alloc.c | 15 ---
> > >  xen/include/public/memory.h |  4 ++--
> > >  xen/include/xen/sched.h | 24 ++--
> > >  15 files changed, 60 insertions(+), 46 deletions(-)
> >
> > From this, with the comment you add next to the struct field, and
> > with your reply yesterday, what about the uses in
> > - arch/arm/arm64/domctl.c:switch_mode(),
> 
> TBH I'm not sure with that one. It looks to me like it needs to check
> whether the domain has *any* memory assigned. Perhaps checking page_list
> would be more appropriate. Perhaps Julien can comment?
> 

Having chatted to Julien, the aim of checking tot_pages here is to test whether 
the domain is newly created, in which case using domain_tot_pages() is the 
appropriate thing to do.

  Paul

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH] libxc: migrate migration stream definitions into Xen public headers

2020-02-01 Thread Durrant, Paul
> -Original Message-
> From: David Woodhouse 
> Sent: 01 February 2020 00:40
> To: xen-devel 
> Cc: Ian Jackson ; Wei Liu ; Andrew
> Cooper ; George Dunlap
> ; Jan Beulich ; Julien
> Grall ; Konrad Rzeszutek Wilk ;
> Stefano Stabellini ; Gautam, Varad
> ; Durrant, Paul 
> Subject: [PATCH] libxc: migrate migration stream definitions into Xen
> public headers
> 
> From: David Woodhouse 
> 
> These data structures will be used for live update, which is basically
> just live migration from one Xen to the next on the same machine via
> in-memory data structures, instead of across the network.
> 
> Signed-off-by: David Woodhouse 
> Well-excellent-carry-on-then-by: Ian Jackson 
> Go-with-that-for-now-by: Andrew Cooper 

This is probably fine but may need some re-structuring when we consider 
non-cooperative migration. I have not yet had time to post a design document 
but my thoughts are along the lines of generalising the idea of 'hvm_context' 
records into 'domain_context' (or some suitable name). Then we can add the 
necessary 'PV' save records, such as event channel state, grant table state, 
etc. We may then want to consider using the same mechanism for PV shared info 
and maybe some of the other records that are currently 'top level' in the XC 
migration stream.

  Paul 

> ---
>  tools/libxc/xc_sr_common.c|  20 ++---
>  tools/libxc/xc_sr_common_x86.c|   4 +-
>  tools/libxc/xc_sr_restore.c   |   2 +-
>  tools/libxc/xc_sr_restore_x86_hvm.c   |   4 +-
>  tools/libxc/xc_sr_restore_x86_pv.c|   8 +-
>  tools/libxc/xc_sr_save.c  |   2 +-
>  tools/libxc/xc_sr_save_x86_hvm.c  |   4 +-
>  tools/libxc/xc_sr_save_x86_pv.c   |  12 +--
>  tools/libxc/xc_sr_stream_format.h |  97 +---
>  xen/include/Makefile  |   2 +-
>  xen/include/public/migration_stream.h | 125 ++
>  11 files changed, 156 insertions(+), 124 deletions(-)
>  create mode 100644 xen/include/public/migration_stream.h
> 
> diff --git a/tools/libxc/xc_sr_common.c b/tools/libxc/xc_sr_common.c
> index dd9a11b4b5..92f9332e73 100644
> --- a/tools/libxc/xc_sr_common.c
> +++ b/tools/libxc/xc_sr_common.c
> @@ -91,7 +91,7 @@ int write_split_record(struct xc_sr_context *ctx, struct
> xc_sr_record *rec,
>  int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record
> *rec)
>  {
>  xc_interface *xch = ctx->xch;
> -struct xc_sr_rhdr rhdr;
> +struct mr_rhdr rhdr;
>  size_t datasz;
> 
>  if ( read_exact(fd, &rhdr, sizeof(rhdr)) )
> @@ -142,15 +142,15 @@ static void __attribute__((unused))
> build_assertions(void)
>  {
>  BUILD_BUG_ON(sizeof(struct xc_sr_ihdr) != 24);
>  BUILD_BUG_ON(sizeof(struct xc_sr_dhdr) != 16);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rhdr) != 8);
> -
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_page_data_header)  != 8);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_info)   != 8);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_p2m_frames) != 8);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_vcpu_hdr)   != 8);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_tsc_info)  != 24);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params_entry)  != 16);
> -BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params)!= 8);
> +BUILD_BUG_ON(sizeof(struct mr_rhdr) != 8);
> +
> +BUILD_BUG_ON(sizeof(struct mr_page_data_header)  != 8);
> +BUILD_BUG_ON(sizeof(struct mr_x86_pv_info)   != 8);
> +BUILD_BUG_ON(sizeof(struct mr_x86_pv_p2m_frames) != 8);
> +BUILD_BUG_ON(sizeof(struct mr_x86_pv_vcpu_hdr)   != 8);
> +BUILD_BUG_ON(sizeof(struct mr_x86_tsc_info)  != 24);
> +BUILD_BUG_ON(sizeof(struct mr_hvm_params_entry)  != 16);
> +BUILD_BUG_ON(sizeof(struct mr_hvm_params)!= 8);
>  }
> 
>  /*
> diff --git a/tools/libxc/xc_sr_common_x86.c
> b/tools/libxc/xc_sr_common_x86.c
> index 011684df97..1627ff72d6 100644
> --- a/tools/libxc/xc_sr_common_x86.c
> +++ b/tools/libxc/xc_sr_common_x86.c
> @@ -3,7 +3,7 @@
>  int write_x86_tsc_info(struct xc_sr_context *ctx)
>  {
>  xc_interface *xch = ctx->xch;
> -struct xc_sr_rec_x86_tsc_info tsc = {};
> +struct mr_x86_tsc_info tsc = {};
>  struct xc_sr_record rec = {
>  .type = REC_TYPE_X86_TSC_INFO,
>  .length = sizeof(tsc),
> @@ -23,7 +23,7 @@ int write_x86_tsc_info(struct xc_sr_context *ctx)
>  int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record
> *rec)
>  {
>  xc_interface *xch = ctx->xch;
> -struct xc_sr_rec_x86_tsc_info *tsc = rec->data;
> +struct mr_x86_tsc_info *tsc = rec->data;
> 
>  if

Re: [Xen-devel] [PATCH v4 7/7] xl: allow domid to be preserved on save/restore or migrate

2020-02-01 Thread Durrant, Paul
> -Original Message-
> From: Wei Liu 
> Sent: 31 January 2020 16:08
> To: Andrew Cooper 
> Cc: Durrant, Paul ; Ian Jackson
> ; Anthony Perard ; xen-
> de...@lists.xenproject.org; Wei Liu 
> Subject: Re: [Xen-devel] [PATCH v4 7/7] xl: allow domid to be preserved on
> save/restore or migrate
> 
> On Thu, Jan 30, 2020 at 06:20:08PM +, Andrew Cooper wrote:
> > On 30/01/2020 17:42, Durrant, Paul wrote:
> > >> -Original Message-
> > >> From: Ian Jackson 
> > >> Sent: 30 January 2020 17:29
> > >> To: Durrant, Paul 
> > >> Cc: xen-devel@lists.xenproject.org; Wei Liu ; Anthony
> Perard
> > >> 
> > >> Subject: Re: [PATCH v4 7/7] xl: allow domid to be preserved on
> > >> save/restore or migrate
> > >>
> > >> Paul Durrant writes ("[PATCH v4 7/7] xl: allow domid to be preserved
> on
> > >> save/restore or migrate"):
> > >>> This patch adds a '-D' command line option to save and migrate to
> allow
> > >>> the domain id to be incorporated into the saved domain configuration
> and
> > >>> hence be preserved.
> > >> Thanks.
> > >>
> > >> In response to v3 6/6 I wrote:
> > >>
> > >>   I wonder if this should be done more in libxl.  Should this be a
> > >>   domain property ?  Wei, Anthony ?
> > >>
> > >> This question remains unanswered.  I'm sorry that neither Wei nor
> > >> Anthony had a chance to answer yet...
> > >>
> > >> Paul, do you have a reason for doing it this way ?  My inclination is
> > >> that think doing it at the libxl layer would make more sense.  Do you
> > >> think that would be possible ?
> > >>
> > > I'm not sure how it would work at the libxl level as the domid is
> > > part of create_info and that it transferred by xl on migration. IIUC
> > > we'd need a new libxl save record to transfer it at that level, and
> > > then I'm not sure whether we'd hit an ordering problem as we'd have
> > > to dig that save record out before we could actually create the
> > > domain.
> >
> > There is definitely an ordering problem.
> >
> > I agree that it should logically be part of the libxl level of the
> > stream, but none of that is even parsed until after the domain has been
> > created on the destination side.
> >
> > I have no idea how easy/difficult it would be to rearrange libxl to
> > start parsing the migration stream before creating the domain.  I think
> > there will be a lot of code relying on the domid already being valid.
> 
> This.
> 
> The other way I can think of is to specify something (domid policy??) in
> create_info and apply it during domain creation. But that reeks like a
> layering violation to me.
> 

That's basically what this series does, but I don't see it as a layering 
violation. It has always been the case that xl is in control of the domain 
creation and then passes the migration stream into libxl. Passing in a 'domid 
policy' (specific value, 'random', or 'allocated by Xen') as part of 
create_info, and not messing with the libxl migration data, seems like the 
right approach to me... which is why I've done it that way.

  Paul

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v5 5/7] libxl: allow creation of domains with a specified or random domid

2020-02-02 Thread Durrant, Paul
> -Original Message-
> From: jandr...@gmail.com 
> Sent: 31 January 2020 17:23
> To: Durrant, Paul 
> Cc: xen-devel ; Ian Jackson
> ; Wei Liu ; Anthony PERARD
> ; Andrew Cooper ;
> George Dunlap ; Jan Beulich
> ; Julien Grall ; Konrad Rzeszutek Wilk
> ; Stefano Stabellini 
> Subject: Re: [PATCH v5 5/7] libxl: allow creation of domains with a
> specified or random domid
> 
> On Fri, Jan 31, 2020 at 10:02 AM Paul Durrant  wrote:
> >
> > This patch adds a 'domid' field to libxl_domain_create_info and then
> > modifies libxl__domain_make() to have Xen use that value if it is valid.
> > If the domid value is invalid then Xen will choose the domid, as before,
> > unless the value is the new special RANDOM_DOMID value added to the API.
> > This value instructs libxl__domain_make() to choose a random domid value
> > for Xen to use.
> >
> > If Xen determines that a domid specified to or chosen by
> > libxl__domain_make() co-incides with an existing domain then the create
> > operation will fail. In this case, if RANDOM_DOMID was specified to
> > libxl__domain_make() then a new random value will be chosen and the
> create
> > operation will be re-tried, otherwise libxl__domain_make() will fail.
> >
> > After Xen has successfully created a new domain, libxl__domain_make()
> will
> > check whether its domid matches any recently used domid values. If it
> does
> > then the domain will be destroyed. If the domid used in creation was
> > specified to libxl__domain_make() then it will fail at this point,
> > otherwise the create operation will be re-tried with either a new random
> > or Xen-selected domid value.
> >
> > NOTE: libxl__logv() is also modified to only log valid domid values in
> >   messages rather than any domid, valid or otherwise, that is not
> >   INVALID_DOMID.
> >
> > Signed-off-by: Paul Durrant 
> 
> Looks good, with one suggestion below.
> 
> Reviewed-by: Jason Andryuk 
> 

Thanks.

> 
> 
> > +
> > +/* Try to destroy the domain again as we can't use it */
> > +ret = xc_domain_destroy(ctx->xch, *domid);
> > +if (ret < 0) {
> > +LOGED(ERROR, *domid, "domain destroy fail");
> 
> Maybe "destroy recently used domain id failed"?
> 

Probably doesn't actually matter. A destroy failure during domain creation 
would be an unusual circumstance, but if I have to post a v6 I'll change the 
message while I'm at it.

  Paul

> > +*domid = INVALID_DOMID;
> > +rc = ERROR_FAIL;
> > +goto out;
> > +}
> >  }
> >
> >  rc = libxl__arch_domain_save_config(gc, d_config, state,
> &create);
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v6 05/11] x86/hyperv: setup hypercall page

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Wei Liu 
> Sent: 31 January 2020 17:57
> To: Xen Development List 
> Cc: Durrant, Paul ; Michael Kelley
> ; Wei Liu ; Wei Liu
> ; Jan Beulich ; Andrew Cooper
> ; Roger Pau Monné 
> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> 
> (Note to self)
> 
> On Fri, Jan 31, 2020 at 05:49:24PM +, Wei Liu wrote:
> [...]
> > +static uint64_t generate_guest_id(void)
> > +{
> > +union hv_guest_os_id id;
> > +
> 
>id.raw = 0;

Or just use a C99 initializer to set things up. A bit neater IMO.

  Paul

> 
> > +id.vendor = HV_XEN_VENDOR_ID;
> > +id.major = xen_major_version();
> > +id.minor = xen_minor_version();
> > +
> > +return id.raw;
> > +}

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v6 05/11] x86/hyperv: setup hypercall page

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 03 February 2020 11:06
> To: Durrant, Paul 
> Cc: Wei Liu ; Xen Development List  de...@lists.xenproject.org>; Michael Kelley ; Wei
> Liu ; Andrew Cooper ;
> Roger Pau Monné 
> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> 
> On 03.02.2020 11:39, Durrant, Paul wrote:
> >> -Original Message-
> >> From: Wei Liu 
> >> Sent: 31 January 2020 17:57
> >> To: Xen Development List 
> >> Cc: Durrant, Paul ; Michael Kelley
> >> ; Wei Liu ; Wei Liu
> >> ; Jan Beulich ; Andrew Cooper
> >> ; Roger Pau Monné 
> >> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> >>
> >> (Note to self)
> >>
> >> On Fri, Jan 31, 2020 at 05:49:24PM +, Wei Liu wrote:
> >> [...]
> >>> +static uint64_t generate_guest_id(void)
> >>> +{
> >>> +union hv_guest_os_id id;
> >>> +
> >>
> >>id.raw = 0;
> >
> > Or just use a C99 initializer to set things up. A bit neater IMO.
> 
> If you mean this for ...
> 
> >>> +id.vendor = HV_XEN_VENDOR_ID;
> >>> +id.major = xen_major_version();
> >>> +id.minor = xen_minor_version();
> 
> ... these three fields, then this won't work with rather old gcc
> we still permit to be used. Using { .raw = 0 } would work afaict.
> 

Not even { .vendor = HV_XEN_VENDOR_ID } ?

  Paul

> Jan
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v6 10/11] x86: move viridian_page_msr to hyperv-tlfs.h

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Wei Liu  On Behalf Of Wei Liu
> Sent: 31 January 2020 17:49
> To: Xen Development List 
> Cc: Durrant, Paul ; Michael Kelley
> ; Wei Liu ; Wei Liu
> ; Jan Beulich ; Andrew Cooper
> ; Roger Pau Monné 
> Subject: [PATCH v6 10/11] x86: move viridian_page_msr to hyperv-tlfs.h
> 
> And rename it to hv_vp_assist_page_msr.
> 
> No functional change.
> 
> Signed-off-by: Wei Liu 

Reviewed-by: Paul Durrant 

> ---
>  xen/arch/x86/hvm/viridian/viridian.c|  2 +-
>  xen/include/asm-x86/guest/hyperv-tlfs.h | 11 +++
>  xen/include/asm-x86/hvm/viridian.h  | 15 ++-
>  3 files changed, 14 insertions(+), 14 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/viridian/viridian.c
> b/xen/arch/x86/hvm/viridian/viridian.c
> index 44c8e6cac6..9a6cafcb62 100644
> --- a/xen/arch/x86/hvm/viridian/viridian.c
> +++ b/xen/arch/x86/hvm/viridian/viridian.c
> @@ -230,7 +230,7 @@ static void dump_guest_os_id(const struct domain *d)
> 
>  static void dump_hypercall(const struct domain *d)
>  {
> -const union viridian_page_msr *hg;
> +const union hv_vp_assist_page_msr *hg;
> 
>  hg = &d->arch.hvm.viridian->hypercall_gpa;
> 
> diff --git a/xen/include/asm-x86/guest/hyperv-tlfs.h b/xen/include/asm-
> x86/guest/hyperv-tlfs.h
> index 07db57b55f..0a0f3398c1 100644
> --- a/xen/include/asm-x86/guest/hyperv-tlfs.h
> +++ b/xen/include/asm-x86/guest/hyperv-tlfs.h
> @@ -558,6 +558,17 @@ struct hv_nested_enlightenments_control {
>   } hypercallControls;
>  };
> 
> +union hv_vp_assist_page_msr
> +{
> +uint64_t raw;
> +struct
> +{
> +uint64_t enabled:1;
> +uint64_t reserved_preserved:11;
> +uint64_t pfn:48;
> +};
> +};
> +
>  /* Define virtual processor assist page structure. */
>  struct hv_vp_assist_page {
>   __u32 apic_assist;
> diff --git a/xen/include/asm-x86/hvm/viridian.h b/xen/include/asm-
> x86/hvm/viridian.h
> index d9138562e6..844e56b38f 100644
> --- a/xen/include/asm-x86/hvm/viridian.h
> +++ b/xen/include/asm-x86/hvm/viridian.h
> @@ -11,20 +11,9 @@
> 
>  #include 
> 
> -union viridian_page_msr
> -{
> -uint64_t raw;
> -struct
> -{
> -uint64_t enabled:1;
> -uint64_t reserved_preserved:11;
> -uint64_t pfn:48;
> -};
> -};
> -
>  struct viridian_page
>  {
> -union viridian_page_msr msr;
> +union hv_vp_assist_page_msr msr;
>  void *ptr;
>  };
> 
> @@ -70,7 +59,7 @@ struct viridian_time_ref_count
>  struct viridian_domain
>  {
>  union hv_guest_os_id guest_os_id;
> -union viridian_page_msr hypercall_gpa;
> +union hv_vp_assist_page_msr hypercall_gpa;
>  struct viridian_time_ref_count time_ref_count;
>  struct viridian_page reference_tsc;
>  };
> --
> 2.20.1


___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v6 11/11] x86/hyperv: setup VP assist page

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Wei Liu  On Behalf Of Wei Liu
> Sent: 31 January 2020 17:50
> To: Xen Development List 
> Cc: Durrant, Paul ; Michael Kelley
> ; Wei Liu ; Wei Liu
> ; Jan Beulich ; Andrew Cooper
> ; Roger Pau Monné 
> Subject: [PATCH v6 11/11] x86/hyperv: setup VP assist page
> 
> VP assist page is rather important as we need to toggle some bits in it
> for efficient nested virtualisation.
> 
> Signed-off-by: Wei Liu 

Reviewed-by: Paul Durrant 

> ---
> v6:
> 1. Use hv_vp_assist_page_msr
> 2. Make code shorter
> 3. Preserve rsvdP fields
> 
> v5:
> 1. Deal with error properly instead of always panicking
> 2. Swap percpu variables declarations' location
> 
> v4:
> 1. Use private.h
> 2. Prevent leak
> 
> v3:
> 1. Use xenheap page
> 2. Drop set_vp_assist
> 
> v2:
> 1. Use HV_HYP_PAGE_SHIFT instead
> ---
>  xen/arch/x86/guest/hyperv/hyperv.c  | 37 -
>  xen/arch/x86/guest/hyperv/private.h |  1 +
>  2 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/x86/guest/hyperv/hyperv.c
> b/xen/arch/x86/guest/hyperv/hyperv.c
> index 6dac3bfceb..75fb329d4d 100644
> --- a/xen/arch/x86/guest/hyperv/hyperv.c
> +++ b/xen/arch/x86/guest/hyperv/hyperv.c
> @@ -31,6 +31,7 @@
> 
>  struct ms_hyperv_info __read_mostly ms_hyperv;
>  DEFINE_PER_CPU_READ_MOSTLY(void *, hv_input_page);
> +DEFINE_PER_CPU_READ_MOSTLY(void *, hv_vp_assist);
>  DEFINE_PER_CPU_READ_MOSTLY(unsigned int, hv_vp_index);
> 
>  static uint64_t generate_guest_id(void)
> @@ -155,17 +156,51 @@ static int setup_hypercall_pcpu_arg(void)
>  return 0;
>  }
> 
> +static int setup_vp_assist(void)
> +{
> +union hv_vp_assist_page_msr msr;
> +
> +if ( !this_cpu(hv_vp_assist) )
> +{
> +this_cpu(hv_vp_assist) = alloc_xenheap_page();
> +if ( !this_cpu(hv_vp_assist) )
> +{
> +printk("CPU%u: Failed to allocate vp_assist page\n",
> +   smp_processor_id());
> +return -ENOMEM;
> +}
> +
> +clear_page(this_cpu(hv_vp_assist));
> +}
> +
> +rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.raw);
> +msr.pfn = virt_to_mfn(this_cpu(hv_vp_assist));
> +msr.enabled = 1;
> +wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.raw);
> +
> +return 0;
> +}
> +
>  static void __init setup(void)
>  {
>  setup_hypercall_page();
> 
>  if ( setup_hypercall_pcpu_arg() )
>  panic("Hyper-V hypercall percpu arg setup failed\n");
> +
> +if ( setup_vp_assist() )
> +panic("VP assist page setup failed\n");
>  }
> 
>  static int ap_setup(void)
>  {
> -return setup_hypercall_pcpu_arg();
> +int rc;
> +
> +rc = setup_hypercall_pcpu_arg();
> +if ( rc )
> +return rc;
> +
> +return setup_vp_assist();
>  }
> 
>  static void __init e820_fixup(struct e820map *e820)
> diff --git a/xen/arch/x86/guest/hyperv/private.h
> b/xen/arch/x86/guest/hyperv/private.h
> index d1765d4f23..956eff831f 100644
> --- a/xen/arch/x86/guest/hyperv/private.h
> +++ b/xen/arch/x86/guest/hyperv/private.h
> @@ -25,6 +25,7 @@
>  #include 
> 
>  DECLARE_PER_CPU(void *, hv_input_page);
> +DECLARE_PER_CPU(void *, hv_vp_assist);
>  DECLARE_PER_CPU(unsigned int, hv_vp_index);
> 
>  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> --
> 2.20.1


___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v6 05/11] x86/hyperv: setup hypercall page

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 03 February 2020 11:34
> To: Durrant, Paul 
> Cc: Wei Liu ; Xen Development List  de...@lists.xenproject.org>; Michael Kelley ; Wei
> Liu ; Andrew Cooper ;
> Roger Pau Monné 
> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> 
> On 03.02.2020 12:21, Durrant, Paul wrote:
> >> -Original Message-
> >> From: Jan Beulich 
> >> Sent: 03 February 2020 11:06
> >> To: Durrant, Paul 
> >> Cc: Wei Liu ; Xen Development List  >> de...@lists.xenproject.org>; Michael Kelley ;
> Wei
> >> Liu ; Andrew Cooper ;
> >> Roger Pau Monné 
> >> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> >>
> >> On 03.02.2020 11:39, Durrant, Paul wrote:
> >>>> -Original Message-
> >>>> From: Wei Liu 
> >>>> Sent: 31 January 2020 17:57
> >>>> To: Xen Development List 
> >>>> Cc: Durrant, Paul ; Michael Kelley
> >>>> ; Wei Liu ; Wei Liu
> >>>> ; Jan Beulich ; Andrew Cooper
> >>>> ; Roger Pau Monné 
> >>>> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> >>>>
> >>>> (Note to self)
> >>>>
> >>>> On Fri, Jan 31, 2020 at 05:49:24PM +, Wei Liu wrote:
> >>>> [...]
> >>>>> +static uint64_t generate_guest_id(void)
> >>>>> +{
> >>>>> +union hv_guest_os_id id;
> >>>>> +
> >>>>
> >>>>id.raw = 0;
> >>>
> >>> Or just use a C99 initializer to set things up. A bit neater IMO.
> >>
> >> If you mean this for ...
> >>
> >>>>> +id.vendor = HV_XEN_VENDOR_ID;
> >>>>> +id.major = xen_major_version();
> >>>>> +id.minor = xen_minor_version();
> >>
> >> ... these three fields, then this won't work with rather old gcc
> >> we still permit to be used. Using { .raw = 0 } would work afaict.
> >>
> >
> > Not even { .vendor = HV_XEN_VENDOR_ID } ?
> 
> No, because of it being part of an unnamed (struct) member of
> the union.

Ok... shame. Presumably an empty initializer - {} -  would be ok?

  Paul

> 
> Jan
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v6 05/11] x86/hyperv: setup hypercall page

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 03 February 2020 11:39
> To: Durrant, Paul 
> Cc: Wei Liu ; Xen Development List  de...@lists.xenproject.org>; Michael Kelley ; Wei
> Liu ; Andrew Cooper ;
> Roger Pau Monné 
> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> 
> On 03.02.2020 12:37, Durrant, Paul wrote:
> >> -Original Message-
> >> From: Jan Beulich 
> >> Sent: 03 February 2020 11:34
> >> To: Durrant, Paul 
> >> Cc: Wei Liu ; Xen Development List  >> de...@lists.xenproject.org>; Michael Kelley ;
> Wei
> >> Liu ; Andrew Cooper ;
> >> Roger Pau Monné 
> >> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> >>
> >> On 03.02.2020 12:21, Durrant, Paul wrote:
> >>>> -Original Message-
> >>>> From: Jan Beulich 
> >>>> Sent: 03 February 2020 11:06
> >>>> To: Durrant, Paul 
> >>>> Cc: Wei Liu ; Xen Development List  >>>> de...@lists.xenproject.org>; Michael Kelley ;
> >> Wei
> >>>> Liu ; Andrew Cooper ;
> >>>> Roger Pau Monné 
> >>>> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> >>>>
> >>>> On 03.02.2020 11:39, Durrant, Paul wrote:
> >>>>>> -Original Message-
> >>>>>> From: Wei Liu 
> >>>>>> Sent: 31 January 2020 17:57
> >>>>>> To: Xen Development List 
> >>>>>> Cc: Durrant, Paul ; Michael Kelley
> >>>>>> ; Wei Liu ; Wei Liu
> >>>>>> ; Jan Beulich ; Andrew Cooper
> >>>>>> ; Roger Pau Monné 
> >>>>>> Subject: Re: [PATCH v6 05/11] x86/hyperv: setup hypercall page
> >>>>>>
> >>>>>> (Note to self)
> >>>>>>
> >>>>>> On Fri, Jan 31, 2020 at 05:49:24PM +, Wei Liu wrote:
> >>>>>> [...]
> >>>>>>> +static uint64_t generate_guest_id(void)
> >>>>>>> +{
> >>>>>>> +union hv_guest_os_id id;
> >>>>>>> +
> >>>>>>
> >>>>>>id.raw = 0;
> >>>>>
> >>>>> Or just use a C99 initializer to set things up. A bit neater IMO.
> >>>>
> >>>> If you mean this for ...
> >>>>
> >>>>>>> +id.vendor = HV_XEN_VENDOR_ID;
> >>>>>>> +id.major = xen_major_version();
> >>>>>>> +id.minor = xen_minor_version();
> >>>>
> >>>> ... these three fields, then this won't work with rather old gcc
> >>>> we still permit to be used. Using { .raw = 0 } would work afaict.
> >>>>
> >>>
> >>> Not even { .vendor = HV_XEN_VENDOR_ID } ?
> >>
> >> No, because of it being part of an unnamed (struct) member of
> >> the union.
> >
> > Ok... shame. Presumably an empty initializer - {} -  would be ok?
> 
> I think so, yes. I understand you'd like this better than
> { .raw = 0 } ?
> 

Yes. In general, using a C99 initializer to explicitly set something to 0 seems 
a bit redundant to me.

  Paul

> Jan
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v4 1/7] SVM: drop asm/hvm/emulate.h inclusion from vmcb.h

2020-02-03 Thread Durrant, Paul
> -Original Message-
> From: Xen-devel  On Behalf Of Jan
> Beulich
> Sent: 31 January 2020 16:42
> To: xen-devel@lists.xenproject.org
> Cc: Petre Pircalabu ; Kevin Tian
> ; Tamas K Lengyel ; Wei Liu
> ; Paul Durrant ; George Dunlap
> ; Andrew Cooper ;
> Tim Deegan ; Jun Nakajima ; Alexandru
> Isaila ; Roger Pau Monné 
> Subject: [Xen-devel] [PATCH v4 1/7] SVM: drop asm/hvm/emulate.h inclusion
> from vmcb.h
> 
> It's not needed there and introduces a needless, almost global
> dependency. Include the file (or in some cases just xen/err.h) where
> actually needed, or - in one case - simply forward-declare a struct. In
> microcode*.c take the opportunity and also re-order a few other
> #include-s.
> 
> Signed-off-by: Jan Beulich 

Reviewed-by: Paul Durrant 

> ---
> v4: New.
> 
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -55,6 +55,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/arch/x86/hvm/ioreq.c
> +++ b/xen/arch/x86/hvm/ioreq.c
> @@ -28,6 +28,7 @@
>  #include 
>  #include 
> 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/arch/x86/hvm/svm/emulate.c
> +++ b/xen/arch/x86/hvm/svm/emulate.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/arch/x86/hvm/vm_event.c
> +++ b/xen/arch/x86/hvm/vm_event.c
> @@ -22,6 +22,7 @@
> 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
> 
> --- a/xen/arch/x86/microcode.c
> +++ b/xen/arch/x86/microcode.c
> @@ -22,9 +22,10 @@
>   */
> 
>  #include 
> -#include 
> -#include 
> +#include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/arch/x86/microcode_amd.c
> +++ b/xen/arch/x86/microcode_amd.c
> @@ -14,9 +14,10 @@
>   *  License version 2. See file COPYING for details.
>   */
> 
> -#include 
> -#include 
> +#include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/arch/x86/microcode_intel.c
> +++ b/xen/arch/x86/microcode_intel.c
> @@ -21,9 +21,10 @@
>   * 2 of the License, or (at your option) any later version.
>   */
> 
> -#include 
> -#include 
> +#include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/arch/x86/mm/shadow/hvm.c
> +++ b/xen/arch/x86/mm/shadow/hvm.c
> @@ -28,6 +28,7 @@
>  #include 
> 
>  #include 
> +#include 
>  #include 
> 
>  #include "private.h"
> --- a/xen/arch/x86/pv/emul-gate-op.c
> +++ b/xen/arch/x86/pv/emul-gate-op.c
> @@ -19,6 +19,7 @@
>   * along with this program; If not, see .
>   */
> 
> +#include 
>  #include 
>  #include 
>  #include 
> --- a/xen/include/asm-x86/hvm/svm/vmcb.h
> +++ b/xen/include/asm-x86/hvm/svm/vmcb.h
> @@ -20,8 +20,6 @@
>  #define __ASM_X86_HVM_SVM_VMCB_H__
> 
>  #include 
> -#include 
> -
> 
>  /* general 1 intercepts */
>  enum GenericIntercept1bits
> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -97,6 +97,7 @@ void vmx_asm_do_vmentry(void);
>  void vmx_intr_assist(void);
>  void noreturn vmx_do_resume(struct vcpu *);
>  void vmx_vlapic_msr_changed(struct vcpu *v);
> +struct hvm_emulate_ctxt;
>  void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt);
>  void vmx_realmode(struct cpu_user_regs *regs);
>  void vmx_update_debug_state(struct vcpu *v);
> 
> 
> ___
> Xen-devel mailing list
> Xen-devel@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] XEN Qdisk Ceph rbd support broken?

2020-02-04 Thread Durrant, Paul
De-htmling...

---
From: Xen-devel  On Behalf Of Jules
Sent: 03 February 2020 17:35
To: xen-devel@lists.xenproject.org
Cc: oleksandr_gryt...@epam.com; w...@xen.org
Subject: [Xen-devel] XEN Qdisk Ceph rbd support broken?

Hey,

I don’t know if it was this or a previous change in qdisk driver, but can it be 
that remote Ceph RBD support is broken?
https://github.com/xen-project/xen/commit/8f486344a00652ed202ade43c02c96771812bf8c

Remote network Ceph image works fine with Xen 4.12.x with a config syntax like 
this:
disk = [ 'format=raw, vdev=xvda1, access=rw,backendtype=qdisk, 
target=rbd:/:id=‘ ]

In Xen 4.13.0 which I have tested recently it blames with the error message „no 
such file or directory“ as it would try accessing the image over filesystem 
instead of remote network image.
---

I doubt the issue is in xl/libxl; sounds more likely to be in QEMU. The PV 
block backend infrastructure in QEMU was changed between the 4.12 and 4.13 
releases. Have you tried using an older QEMU with 4.13?

  Paul
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v2 2/2] xen/x86: hap: Clean-up and harden hap_enable()

2020-02-04 Thread Durrant, Paul
> -Original Message-
> From: Xen-devel  On Behalf Of
> Julien Grall
> Sent: 04 February 2020 13:06
> To: xen-devel@lists.xenproject.org
> Cc: jul...@xen.org; Wei Liu ; George Dunlap
> ; Andrew Cooper ;
> Grall, Julien ; Jan Beulich ; Roger
> Pau Monné 
> Subject: [Xen-devel] [PATCH v2 2/2] xen/x86: hap: Clean-up and harden
> hap_enable()
> 
> From: Julien Grall 
> 
> Unlike shadow_enable(), hap_enable() can only be called once during
> domain creation and with the mode equal to mode equal to
> PG_external | PG_translate | PG_refcounts.
> 
> If it were called twice, then we might have something interesting
> problem as the p2m tables would be re-allocated (and therefore all the
> mappings would be lost).

There are two tests in p2m_alloc_table: whether the domain has memory 
allocated, and whether the domain already has a p2m. Can these now be dropped?

  Paul

> 
> Add code to sanity check the mode and that the function is only called
> once. Take the opportunity to remove an if checking that PG_translate is set.
> 
> Signed-off-by: Julien Grall 
> 
> ---
> 
> It is not entirely clear when PG_translate was enforced.
> 
> I keep the check != 0 because this is consistent with the rest of the
> file. If we want to omit comparison against 0, then this should be in a
> separate patches converting the file.
> 
> Changes in v2:
> - Fix typoes in the commit message
> - Use -EEXIST instead of -EINVAL
> ---
>  xen/arch/x86/mm/hap/hap.c | 18 +++---
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index 31362a31b6..4974bd13d4 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -445,6 +445,13 @@ int hap_enable(struct domain *d, u32 mode)
>  unsigned int i;
>  int rv = 0;
> 
> +if ( mode != (PG_external | PG_translate | PG_refcounts) )
> +return -EINVAL;
> +
> +/* The function can only be called once */
> +if ( d->arch.paging.mode != 0 )
> +return -EEXIST;
> +
>  domain_pause(d);
> 
>  old_pages = d->arch.paging.hap.total_pages;
> @@ -465,13 +472,10 @@ int hap_enable(struct domain *d, u32 mode)
>  d->arch.paging.alloc_page = hap_alloc_p2m_page;
>  d->arch.paging.free_page = hap_free_p2m_page;
> 
> -/* allocate P2m table */
> -if ( mode & PG_translate )
> -{
> -rv = p2m_alloc_table(p2m_get_hostp2m(d));
> -if ( rv != 0 )
> -goto out;
> -}
> +/* allocate P2M table */
> +rv = p2m_alloc_table(p2m_get_hostp2m(d));
> +if ( rv != 0 )
> +goto out;
> 
>  for ( i = 0; i < MAX_NESTEDP2M; i++ )
>  {
> --
> 2.17.1
> 
> 
> ___
> Xen-devel mailing list
> Xen-devel@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH] libxl: fix assertion failure in stub domain creation

2020-02-05 Thread Durrant, Paul
> -Original Message-
> From: Anthony PERARD 
> Sent: 05 February 2020 10:47
> To: Durrant, Paul 
> Cc: xen-devel@lists.xenproject.org; Ian Jackson
> ; Wei Liu ; Roger Pau Monné
> 
> Subject: Re: [PATCH] libxl: fix assertion failure in stub domain creation
> 
> On Wed, Feb 05, 2020 at 09:37:24AM +, Paul Durrant wrote:
> > An assertion in libxl__domain_make():
> >
> > 'soft_reset || *domid == INVALID_DOMID'
> >
> > does not hold true for stub domain creation, where soft_reset is false
> > but the passed in domid == 0. This is easily fixed by changing the
> > initializer in libxl__spawn_stub_dm().
> >
> > Fixes: 75259239d85d ("libxl_create: make 'soft reset' explicit")
> > Signed-off-by: Paul Durrant 
> > ---
> >  tools/libxl/libxl_dm.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
> > index f758daf3b6..3b1da90167 100644
> > --- a/tools/libxl/libxl_dm.c
> > +++ b/tools/libxl/libxl_dm.c
> > @@ -2127,7 +2127,7 @@ void libxl__spawn_stub_dm(libxl__egc *egc,
> libxl__stub_dm_spawn_state *sdss)
> >  goto out;
> >  }
> >
> > -sdss->pvqemu.guest_domid = 0;
> > +sdss->pvqemu.guest_domid = INVALID_DOMID;
> 
> How this works? INVALID_DOMID seems to be directly feed to
> xc_domain_create(), which is using XEN_DOMCTL_createdomain.
> But a comment in domctl.h for XEN_DOMCTL_createdomain read:
> NB. xen_domctl.domain is an IN/OUT parameter for this operation.
> If it is specified as zero, an id is auto-allocated and returned.
> So, is xc_domain_create going to create a new domain with
> domid==INVALID_DOMID?
> 

As it happens, no. That comment is wrong. It should read "If it is not set to a 
valid value, an id is auto-allocated and returned".

  Paul


___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v7 05/10] x86/hyperv: setup hypercall page

2020-02-05 Thread Durrant, Paul
> -Original Message-
> From: Wei Liu  On Behalf Of Wei Liu
> Sent: 04 February 2020 15:37
> To: Xen Development List 
> Cc: Durrant, Paul ; Michael Kelley
> ; Wei Liu ; Wei Liu
> ; Jan Beulich ; Andrew Cooper
> ; Roger Pau Monné 
> Subject: [PATCH v7 05/10] x86/hyperv: setup hypercall page
> 
> Hyper-V uses a technique called overlay page for its hypercall page. It
> will insert a backing page to the guest when the hypercall functionality
> is enabled. That means we can use a page that is not backed by real
> memory for hypercall page.
> 
> To avoid shattering L0 superpages and treading on any MMIO areas
> residing in low addresses, use the top-most addressable page for that
> purpose. Adjust e820 map accordingly.
> 
> We also need to register Xen's guest OS ID to Hyper-V. Use 0x3 as the
> vendor ID. Fix the comment in hyperv-tlfs.h while at it.
> 
> Signed-off-by: Wei Liu 

Acked-by: Paul Durrant 

> ---
> v7:
> 1. Fix a style issue
> 2. Initialise ID to 0
> 3. Update commit message
> 
> v6:
> 1. Use hv_guest_os_id
> 2. Use new e820_fixup hook
> 3. Add a BUILD_BUG_ON
> 
> v5:
> 1. use hypervisor_reserve_top_pages
> 2. add a macro for hypercall page mfn
> 3. address other misc comments
> 
> v4:
> 1. Use fixmap
> 2. Follow routines listed in TLFS
> ---
>  xen/arch/x86/guest/hyperv/hyperv.c  | 69 +++--
>  xen/include/asm-x86/guest/hyperv-tlfs.h |  5 +-
>  xen/include/asm-x86/guest/hyperv.h  |  3 ++
>  3 files changed, 72 insertions(+), 5 deletions(-)
> 
> diff --git a/xen/arch/x86/guest/hyperv/hyperv.c
> b/xen/arch/x86/guest/hyperv/hyperv.c
> index 8d38313d7a..2e20a96f30 100644
> --- a/xen/arch/x86/guest/hyperv/hyperv.c
> +++ b/xen/arch/x86/guest/hyperv/hyperv.c
> @@ -19,15 +19,27 @@
>   * Copyright (c) 2019 Microsoft.
>   */
>  #include 
> +#include 
> 
> +#include 
>  #include 
>  #include 
> +#include 
> 
>  struct ms_hyperv_info __read_mostly ms_hyperv;
> 
> -static const struct hypervisor_ops ops = {
> -.name = "Hyper-V",
> -};
> +static uint64_t generate_guest_id(void)
> +{
> +union hv_guest_os_id id = {};
> +
> +id.vendor = HV_XEN_VENDOR_ID;
> +id.major = xen_major_version();
> +id.minor = xen_minor_version();
> +
> +return id.raw;
> +}
> +
> +static const struct hypervisor_ops ops;
> 
>  const struct hypervisor_ops *__init hyperv_probe(void)
>  {
> @@ -72,6 +84,57 @@ const struct hypervisor_ops *__init hyperv_probe(void)
>  return &ops;
>  }
> 
> +static void __init setup_hypercall_page(void)
> +{
> +union hv_x64_msr_hypercall_contents hypercall_msr;
> +union hv_guest_os_id guest_id;
> +unsigned long mfn;
> +
> +BUILD_BUG_ON(HV_HYP_PAGE_SHIFT != PAGE_SHIFT);
> +
> +rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id.raw);
> +if ( !guest_id.raw )
> +{
> +guest_id.raw = generate_guest_id();
> +wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id.raw);
> +}
> +
> +rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +if ( !hypercall_msr.enable )
> +{
> +mfn = HV_HCALL_MFN;
> +hypercall_msr.enable = 1;
> +hypercall_msr.guest_physical_address = mfn;
> +wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +}
> +else
> +mfn = hypercall_msr.guest_physical_address;
> +
> +rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> +BUG_ON(!hypercall_msr.enable);
> +
> +set_fixmap_x(FIX_X_HYPERV_HCALL, mfn << PAGE_SHIFT);
> +}
> +
> +static void __init setup(void)
> +{
> +setup_hypercall_page();
> +}
> +
> +static void __init e820_fixup(struct e820map *e820)
> +{
> +uint64_t s = HV_HCALL_MFN << PAGE_SHIFT;
> +
> +if ( !e820_add_range(e820, s, s + PAGE_SIZE, E820_RESERVED) )
> +panic("Unable to reserve Hyper-V hypercall range\n");
> +}
> +
> +static const struct hypervisor_ops ops = {
> +.name = "Hyper-V",
> +.setup = setup,
> +.e820_fixup = e820_fixup,
> +};
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/include/asm-x86/guest/hyperv-tlfs.h b/xen/include/asm-
> x86/guest/hyperv-tlfs.h
> index 091e25cdd1..0a0f3398c1 100644
> --- a/xen/include/asm-x86/guest/hyperv-tlfs.h
> +++ b/xen/include/asm-x86/guest/hyperv-tlfs.h
> @@ -318,15 +318,16 @@ struct ms_hyperv_tsc_page {
>   *
>   * Bit(s)
>   * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
> - * 62:56 - Os Type; Linux is 0x100
> + * 62:56 - Os Type; Linux 0x1, FreeBSD 0x2, Xen 0x3
>   * 55:48 - Distro specific identificati

Re: [Xen-devel] [PATCH v9 0/4] purge free_shared_domheap_page()

2020-02-06 Thread Durrant, Paul
AFAICT these patches have the necessary A-b/R-b-s, or are there some missing 
that I need to chase?

  Paul

> -Original Message-
> From: Paul Durrant 
> Sent: 03 February 2020 10:57
> To: xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Andrew Cooper
> ; George Dunlap ; Ian
> Jackson ; Julien Grall ; Jun
> Nakajima ; Konrad Rzeszutek Wilk
> ; Roger Pau Monné ; Stefano
> Stabellini ; Tim Deegan ; Volodymyr
> Babchuk ; Wei Liu 
> Subject: [PATCH v9 0/4] purge free_shared_domheap_page()
> 
> Paul Durrant (4):
>   x86 / vmx: move teardown from domain_destroy()...
>   add a domain_tot_pages() helper function
>   mm: make pages allocated with MEMF_no_refcount safe to assign
>   x86 / vmx: use a MEMF_no_refcount domheap page for
> APIC_DEFAULT_PHYS_BASE
> 
>  xen/arch/arm/arm64/domctl.c |  2 +-
>  xen/arch/x86/domain.c   |  2 +-
>  xen/arch/x86/hvm/vmx/vmx.c  | 25 ---
>  xen/arch/x86/mm.c   | 15 ++-
>  xen/arch/x86/mm/p2m-pod.c   | 10 ++---
>  xen/arch/x86/mm/shadow/common.c |  2 +-
>  xen/arch/x86/msi.c  |  2 +-
>  xen/arch/x86/numa.c |  2 +-
>  xen/arch/x86/pv/dom0_build.c| 25 ++-
>  xen/arch/x86/pv/domain.c|  2 +-
>  xen/arch/x86/pv/shim.c  |  4 +-
>  xen/common/domctl.c |  2 +-
>  xen/common/grant_table.c|  4 +-
>  xen/common/keyhandler.c |  2 +-
>  xen/common/memory.c |  2 +-
>  xen/common/page_alloc.c | 78 -
>  xen/include/asm-arm/mm.h|  5 ++-
>  xen/include/asm-x86/mm.h|  9 ++--
>  xen/include/public/memory.h |  4 +-
>  xen/include/xen/sched.h | 27 +---
>  20 files changed, 143 insertions(+), 81 deletions(-)
> ---
> Cc: Andrew Cooper 
> Cc: George Dunlap 
> Cc: Ian Jackson 
> Cc: Julien Grall 
> Cc: Jun Nakajima 
> Cc: Konrad Rzeszutek Wilk 
> Cc: "Roger Pau Monné" 
> Cc: Stefano Stabellini 
> Cc: Tim Deegan 
> Cc: Volodymyr Babchuk 
> Cc: Wei Liu 
> --
> 2.20.1

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 0/4] purge free_shared_domheap_page()

2020-02-06 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 06 February 2020 08:46
> To: Durrant, Paul 
> Cc: xen-devel@lists.xenproject.org; Stefano Stabellini
> ; Julien Grall ; Wei Liu
> ; Konrad Rzeszutek Wilk ; Andrew
> Cooper ; Ian Jackson
> ; George Dunlap ; Tim
> Deegan ; Jun Nakajima ; Volodymyr
> Babchuk ; Roger Pau Monné
> 
> Subject: Re: [Xen-devel] [PATCH v9 0/4] purge free_shared_domheap_page()
> 
> On 06.02.2020 09:28, Durrant, Paul wrote:
> > AFAICT these patches have the necessary A-b/R-b-s, or are there some
> missing that I need to chase?
> 
> According to my records ...
> 
> >> -Original Message-
> >> From: Paul Durrant 
> >> Sent: 03 February 2020 10:57
> >>
> >> Paul Durrant (4):
> >>   x86 / vmx: move teardown from domain_destroy()...
> >>   add a domain_tot_pages() helper function
> >>   mm: make pages allocated with MEMF_no_refcount safe to assign
> >>   x86 / vmx: use a MEMF_no_refcount domheap page for
> >> APIC_DEFAULT_PHYS_BASE
> >>
> >>  xen/arch/arm/arm64/domctl.c |  2 +-
> 
> ... this (Arm), ...
> 
> >>  xen/arch/x86/domain.c   |  2 +-
> >>  xen/arch/x86/hvm/vmx/vmx.c  | 25 ---
> 
> ... this (VMX), ...
> 
> >>  xen/arch/x86/mm.c   | 15 ++-
> >>  xen/arch/x86/mm/p2m-pod.c   | 10 ++---
> 
> ... this (MM), ...
> 
> >>  xen/arch/x86/mm/shadow/common.c |  2 +-
> 
> ... this (shadow), ...
> 
> >>  xen/arch/x86/msi.c  |  2 +-
> >>  xen/arch/x86/numa.c |  2 +-
> >>  xen/arch/x86/pv/dom0_build.c| 25 ++-
> >>  xen/arch/x86/pv/domain.c|  2 +-
> >>  xen/arch/x86/pv/shim.c  |  4 +-
> >>  xen/common/domctl.c |  2 +-
> >>  xen/common/grant_table.c|  4 +-
> >>  xen/common/keyhandler.c |  2 +-
> >>  xen/common/memory.c |  2 +-
> >>  xen/common/page_alloc.c | 78 -
> >>  xen/include/asm-arm/mm.h|  5 ++-
> 
> ... and this (Arm again). I think almost all are for patch 2, with
> an Arm one needed on patch 3. If I overlooked any, please point me
> at them.

Ok, thanks. Kevin has completed his acks (patches #1 and #4).

George, Julien, Tim,

  Can I have acks or otherwise, please?

  Paul

> 
> Jan
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v9 3/4] mm: make pages allocated with MEMF_no_refcount safe to assign

2020-02-06 Thread Durrant, Paul
> -Original Message-
> From: Julien Grall 
> Sent: 06 February 2020 10:04
> To: Durrant, Paul ; xen-devel@lists.xenproject.org
> Cc: Jan Beulich ; Andrew Cooper
> ; George Dunlap ;
> Ian Jackson ; Konrad Rzeszutek Wilk
> ; Stefano Stabellini ; Wei
> Liu ; Volodymyr Babchuk ; Roger
> Pau Monné 
> Subject: Re: [PATCH v9 3/4] mm: make pages allocated with MEMF_no_refcount
> safe to assign
> 
> Hi,
> 
> I am sorry to jump that late in the conversation.
> 
> On 03/02/2020 10:56, Paul Durrant wrote:
> >
> > -if ( unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 <<
> order)) )
> > +if ( !(memflags & MEMF_no_refcount) &&
> > + unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 <<
> order)) )
> >   get_knownalive_domain(d);
> > -}
> >
> >   for ( i = 0; i < (1 << order); i++ )
> >   {
> >   ASSERT(page_get_owner(&pg[i]) == NULL);
> > -ASSERT(!pg[i].count_info);
> >   page_set_owner(&pg[i], d);
> >   smp_wmb(); /* Domain pointer must be visible before updating
> refcnt. */
> > -pg[i].count_info = PGC_allocated | 1;
> > +pg[i].count_info =
> > +(pg[i].count_info & PGC_extra) | PGC_allocated | 1;
> 
> This is technically incorrect because we blindly assume the state of the
> page is inuse (which is thankfully equal to 0).

Assuming the page is inuse seems reasonable at this point.

> 
> See the discussion [1]. This is already an existing bug in the code base
> and I will be taking care of it.

Fair enough; it's a very long standing bug.

> However...
> 
> >   page_list_add_tail(&pg[i], &d->page_list);
> >   }
> >
> > @@ -2315,11 +2338,6 @@ struct page_info *alloc_domheap_pages(
> >
> >   if ( memflags & MEMF_no_owner )
> >   memflags |= MEMF_no_refcount;
> > -else if ( (memflags & MEMF_no_refcount) && d )
> > -{
> > -ASSERT(!(memflags & MEMF_no_refcount));
> > -return NULL;
> > -}
> >
> >   if ( !dma_bitsize )
> >   memflags &= ~MEMF_no_dma;
> > @@ -2332,11 +2350,23 @@ struct page_info *alloc_domheap_pages(
> > memflags, d)) == NULL)) )
> >return NULL;
> >
> > -if ( d && !(memflags & MEMF_no_owner) &&
> > - assign_pages(d, pg, order, memflags) )
> > +if ( d && !(memflags & MEMF_no_owner) )
> >   {
> > -free_heap_pages(pg, order, memflags & MEMF_no_scrub);
> > -return NULL;
> > +if ( memflags & MEMF_no_refcount )
> > +{
> > +unsigned long i;
> > +
> > +for ( i = 0; i < (1ul << order); i++ )
> > +{
> > +ASSERT(!pg[i].count_info);
> > +pg[i].count_info = PGC_extra;
> 
> ... this is pursuing the wrongness of the code above and not safe
> against offlining.
> 
> We could argue this is an already existing bug, however I am a bit
> unease to add more abuse in the code. Jan, what do you think?
> 

I'd consider a straightforward patch-clash. If this patch goes in after yours 
then it needs to be modified accordingly, or vice versa.

  Paul

> > +}
> > +}
> > +if ( assign_pages(d, pg, order, memflags) )
> > +{
> > +free_heap_pages(pg, order, memflags & MEMF_no_scrub);
> > +return NULL;
> > +}
> >   }
> 
> Cheers,
> 
> [1] https://lore.kernel.org/xen-devel/20200204133357.32101-1-
> jul...@xen.org/
> 
> --
> Julien Grall
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v2] xen/mm: Avoid assuming the page is inuse in assign_pages()

2020-02-06 Thread Durrant, Paul
> -Original Message-
> From: Julien Grall 
> Sent: 06 February 2020 10:39
> To: xen-devel@lists.xenproject.org
> Cc: jul...@xen.org; Durrant, Paul ; Grall, Julien
> 
> Subject: [PATCH v2] xen/mm: Avoid assuming the page is inuse in
> assign_pages()
> 
> From: Julien Grall 
> 
> At the moment, assign_pages() expects the page to be inuse (PGC_state_inuse)
> and the state value to be 0.
> 
> However, the code may race with the page offlining code (see
> offline_page()). Depending on the ordering, the page may be in offlining
> state (PGC_state_offlining) before it is assigned to a domain.
> 
> On debug build, this may result in hitting the assert or clobbering the
> state. On non-debug build, the state will get clobbered.
> 
> Incidentally the flag PGC_broken will get clobbered as well.
> 
> Grab the heap_lock to prevent a race with offline_page() and keep the
> state and broken flag around.
> 
> Signed-off-by: Julien Grall 

This seems like a reasonable change. I guess having assign_pages() take the 
global lock is no more problem than its existing call to 
domain_adjust_tot_pages() which also takes the same lock.

  Paul

> 
> ---
> Changes in v2:
> - Superseed <20200204133357.32101-1-jul...@xen.org>
> - Fix the race with offline_page()
> ---
>  xen/common/page_alloc.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 97902d42c1..a684dbf37c 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -2283,15 +2283,27 @@ int assign_pages(
>  get_knownalive_domain(d);
>  }
> 
> +spin_lock(&heap_lock);
>  for ( i = 0; i < (1 << order); i++ )
>  {
> +/*
> + * We should only be here if the page is inuse or offlining.
> + * The latter happen if we race with mark_page_offline() as we
> + * don't hold the heap_lock.
> + */
> +ASSERT(page_state_is(&pg[i], inuse) ||
> +   page_state_is(&pg[i], offlining));
> +ASSERT(!(pg[i].count_info & ~(PGC_state | PGC_broken)));
>  ASSERT(page_get_owner(&pg[i]) == NULL);
> -ASSERT(!pg[i].count_info);
>  page_set_owner(&pg[i], d);
>  smp_wmb(); /* Domain pointer must be visible before updating
> refcnt. */
> -pg[i].count_info = PGC_allocated | 1;
> +
> +pg[i].count_info &= PGC_state | PGC_broken;
> +pg[i].count_info |= PGC_allocated | 1;
> +
>  page_list_add_tail(&pg[i], &d->page_list);
>  }
> +spin_unlock(&heap_lock);
> 
>   out:
>  spin_unlock(&d->page_alloc_lock);
> --
> 2.17.1


___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v2] xen/mm: Avoid assuming the page is inuse in assign_pages()

2020-02-06 Thread Durrant, Paul
> -Original Message-
> From: Julien Grall 
> Sent: 06 February 2020 11:17
> To: Durrant, Paul ; xen-devel@lists.xenproject.org
> Cc: Grall, Julien 
> Subject: Re: [PATCH v2] xen/mm: Avoid assuming the page is inuse in
> assign_pages()
> 
> Hi Paul,
> 
> On 06/02/2020 10:52, Durrant, Paul wrote:
> >> -Original Message-
> >> From: Julien Grall 
> >> Sent: 06 February 2020 10:39
> >> To: xen-devel@lists.xenproject.org
> >> Cc: jul...@xen.org; Durrant, Paul ; Grall,
> Julien
> >> 
> >> Subject: [PATCH v2] xen/mm: Avoid assuming the page is inuse in
> >> assign_pages()
> >>
> >> From: Julien Grall 
> >>
> >> At the moment, assign_pages() on the page to be inuse (PGC_state_inuse)
> >> and the state value to be 0.
> >>
> >> However, the code may race with the page offlining code (see
> >> offline_page()). Depending on the ordering, the page may be in
> offlining
> >> state (PGC_state_offlining) before it is assigned to a domain.
> >>
> >> On debug build, this may result to hit the assert or just clobber the
> >> state. On non-debug build, the state will get clobbered.
> >>
> >> Incidentally the flag PGC_broken will get clobbered as well.
> >>
> >> Grab the heap_lock to prevent a race with offline_page() and keep the
> >> state and broken flag around.
> >>
> >> Signed-off-by: Julien Grall 
> >
> > This seems like a reasonable change. I guess having assign_pages() take
> the global lock is no more problem than its existing call to
> domain_adjust_tot_pages() which also takes the same lock.
> 
> That's my understanding. Summarizing our discussion IRL for the other,
> it is not clear whether the lock is enough here.
> 
>  From my understanding the sequence
> 
> pg[i].count_info &= ...;
> pg[i].count_info |= ...;
> 
> could result to multiple read/write from the compiler. We could use a
> single assignment, but I still don't think this prevent the compiler to
> be use multiple read/write.
> 
> The concern would be a race with get_page_owner_and_reference(). If 1 is
> set before the rest of the bits, then you may be able to get the page.
> 
> So I might want to use write_atomic() below. Any opinion?
> 

TBH I wonder if we ought to say that any update to count_info ought to be done 
by a write_atomic (where it's not already done by cmpxchg).

  Paul

> Cheers,
> 
> --
> Julien Grall
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v4 3/7] x86/HVM: introduce "curr" into hvmemul_rep_{mov, sto}s()

2020-02-06 Thread Durrant, Paul
> -Original Message-
> From: Xen-devel  On Behalf Of Jan
> Beulich
> Sent: 31 January 2020 16:43
> To: xen-devel@lists.xenproject.org
> Cc: George Dunlap ; Andrew Cooper
> ; Roger Pau Monné ; Wei
> Liu ; Paul Durrant 
> Subject: [Xen-devel] [PATCH v4 3/7] x86/HVM: introduce "curr" into
> hvmemul_rep_{mov, sto}s()
> 
> There are a number of uses of "current" already, and more may appear
> down the road. Latch into a local variable.
> 
> At this occasion also drop stray casts from code getting touched anyway.
> 
> Signed-off-by: Jan Beulich 

Reviewed-by: Paul Durrant 

...with one suggestion:

> ---
> v4: New.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1747,7 +1747,8 @@ static int hvmemul_rep_movs(
>  {
>  struct hvm_emulate_ctxt *hvmemul_ctxt =
>  container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
> -struct hvm_vcpu_io *vio = ¤t->arch.hvm.hvm_io;
> +struct vcpu *curr = current;
> +struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io;
>  unsigned long saddr, daddr, bytes;
>  paddr_t sgpa, dgpa;
>  uint32_t pfec = PFEC_page_present;
> @@ -1807,8 +1808,8 @@ static int hvmemul_rep_movs(
>  }
> 
>  /* Check for MMIO ops */
> -(void) get_gfn_query_unlocked(current->domain, sgpa >> PAGE_SHIFT,
> &sp2mt);
> -(void) get_gfn_query_unlocked(current->domain, dgpa >> PAGE_SHIFT,
> &dp2mt);
> +get_gfn_query_unlocked(curr->domain, sgpa >> PAGE_SHIFT, &sp2mt);
> +get_gfn_query_unlocked(curr->domain, dgpa >> PAGE_SHIFT, &dp2mt);

Since we have more than one 'curr->domain', I think it would be nice to a 
latched 'currd' domain pointer too.

  Paul

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v4 7/7] x86/HVM: reduce scope of pfec in hvm_emulate_init_per_insn()

2020-02-06 Thread Durrant, Paul
> -Original Message-
> From: Xen-devel  On Behalf Of Jan
> Beulich
> Sent: 31 January 2020 16:46
> To: xen-devel@lists.xenproject.org
> Cc: George Dunlap ; Andrew Cooper
> ; Roger Pau Monné ; Wei
> Liu ; Paul Durrant 
> Subject: [Xen-devel] [PATCH v4 7/7] x86/HVM: reduce scope of pfec in
> hvm_emulate_init_per_insn()
> 
> It needs calculating only in one out of three cases. Re-structure the
> code a little such that the variable truly gets calculated only when we
> don't get any insn bytes from elsewhere, and hence need to (try to)
> fetch them. Also OR in PFEC_insn_fetch right in the initializer.
> 
> While in this mood, restrict addr's scope as well.
> 
> Signed-off-by: Jan Beulich 

Reviewed-by: Paul Durrant 

> ---
> v4: New.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -2762,8 +2762,6 @@ void hvm_emulate_init_per_insn(
>  unsigned int insn_bytes)
>  {
>  struct vcpu *curr = current;
> -unsigned int pfec = PFEC_page_present;
> -unsigned long addr;
> 
>  hvmemul_ctxt->ctxt.lma = hvm_long_mode_active(curr);
> 
> @@ -2778,14 +2776,23 @@ void hvm_emulate_init_per_insn(
>  hvmemul_ctxt->seg_reg[x86_seg_ss].db ? 32 : 16;
>  }
> 
> -if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> -pfec |= PFEC_user_mode;
> -
>  hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->rip;
> -if ( !insn_bytes )
> +
> +if ( insn_bytes )
>  {
> +hvmemul_ctxt->insn_buf_bytes = insn_bytes;
> +memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
> +}
> +else if ( !(hvmemul_ctxt->insn_buf_bytes =
> +hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)) )
> +{
> +unsigned int pfec = PFEC_page_present | PFEC_insn_fetch;
> +unsigned long addr;
> +
> +if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> +pfec |= PFEC_user_mode;
> +
>  hvmemul_ctxt->insn_buf_bytes =
> -hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
>  (hvm_virtual_to_linear_addr(x86_seg_cs,
>  &hvmemul_ctxt->seg_reg[x86_seg_cs],
>  hvmemul_ctxt->insn_buf_eip,
> @@ -2795,15 +2802,9 @@ void hvm_emulate_init_per_insn(
>  &addr) &&
>   hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr,
>  sizeof(hvmemul_ctxt->insn_buf),
> -pfec | PFEC_insn_fetch,
> -NULL) == HVMTRANS_okay) ?
> +pfec, NULL) == HVMTRANS_okay) ?
>  sizeof(hvmemul_ctxt->insn_buf) : 0;
>  }
> -else
> -{
> -hvmemul_ctxt->insn_buf_bytes = insn_bytes;
> -memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
> -}
>  }
> 
>  void hvm_emulate_writeback(
> 
> 
> ___
> Xen-devel mailing list
> Xen-devel@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

RE: [PATCH 1/6] x86/iommu: re-arrange arch_iommu to separate common fields...

2020-07-27 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 26 July 2020 09:14
> To: p...@xen.org
> Cc: 'Andrew Cooper' ; 
> xen-devel@lists.xenproject.org; Durrant, Paul
> ; 'Lukasz Hawrylko' ; 
> 'Wei Liu' ;
> 'Roger Pau Monné' ; 'Kevin Tian' 
> Subject: RE: [EXTERNAL] [PATCH 1/6] x86/iommu: re-arrange arch_iommu to 
> separate common fields...
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24.07.2020 20:49, Paul Durrant wrote:
> >> From: Andrew Cooper 
> >> Sent: 24 July 2020 18:29
> >>
> >> On 24/07/2020 17:46, Paul Durrant wrote:
> >>> diff --git a/xen/include/asm-x86/iommu.h b/xen/include/asm-x86/iommu.h
> >>> index 6c9d5e5632..a7add5208c 100644
> >>> --- a/xen/include/asm-x86/iommu.h
> >>> +++ b/xen/include/asm-x86/iommu.h
> >>> @@ -45,16 +45,23 @@ typedef uint64_t daddr_t;
> >>>
> >>>  struct arch_iommu
> >>>  {
> >>> -u64 pgd_maddr; /* io page directory machine address 
> >>> */
> >>> -spinlock_t mapping_lock;/* io page table lock */
> >>> -int agaw; /* adjusted guest address width, 0 is level 2 30-bit */
> >>> -u64 iommu_bitmap;  /* bitmap of iommu(s) that the domain 
> >>> uses */
> >>> -struct list_head mapped_rmrrs;
> >>> -
> >>> -/* amd iommu support */
> >>> -int paging_mode;
> >>> -struct page_info *root_table;
> >>> -struct guest_iommu *g_iommu;
> >>> +spinlock_t mapping_lock; /* io page table lock */
> >>> +
> >>> +union {
> >>> +/* Intel VT-d */
> >>> +struct {
> >>> +u64 pgd_maddr; /* io page directory machine address */
> >>> +int agaw; /* adjusted guest address width, 0 is level 2 
> >>> 30-bit */
> >>> +u64 iommu_bitmap; /* bitmap of iommu(s) that the domain uses 
> >>> */
> >>> +struct list_head mapped_rmrrs;
> >>> +} vtd;
> >>> +/* AMD IOMMU */
> >>> +struct {
> >>> +int paging_mode;
> >>> +struct page_info *root_table;
> >>> +struct guest_iommu *g_iommu;
> >>> +} amd_iommu;
> >>> +};
> >>
> >> The naming split here is weird.
> >>
> >> Ideally we'd have struct {vtd,amd}_iommu in appropriate headers, and
> >> this would be simply
> >>
> >> union {
> >> struct vtd_iommu vtd;
> >> struct amd_iommu amd;
> >> };
> >>
> >> If this isn't trivial to arrange, can we at least s/amd_iommu/amd/ here ?
> >
> > I was in two minds. I tried to look for a TLA for the AMD IOMMU and 'amd' 
> > seemed a little too non-
> descript. I don't really mind though if there's a strong preference to 
> shorted it.
> 
> +1 for shortening in some way. Even amd_vi would already be better imo,
> albeit I'm with Andrew and would think just amd is fine here (and
> matches how things are in the file system structure).
> 

OK, I'll shorten to 'amd'.

> While at it, may I ask that you also switch the plain "int" fields to
> "unsigned int" - I think that's doable for both of them.
> 

Sure, I can do that.

  Paul

> Jan


RE: [PATCH 2/6] x86/iommu: add common page-table allocator

2020-07-27 Thread Durrant, Paul
> -Original Message-
> From: Andrew Cooper 
> Sent: 24 July 2020 19:24
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Jan Beulich ; 
> Kevin Tian
> ; Wei Liu ; Roger Pau Monné 
> 
> Subject: RE: [EXTERNAL] [PATCH 2/6] x86/iommu: add common page-table allocator
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24/07/2020 17:46, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > Instead of having separate page table allocation functions in VT-d and AMD
> > IOMMU code, use a common allocation function in the general x86 code.
> > Also, rather than walking the page tables and using a tasklet to free them
> > during iommu_domain_destroy(), add allocated page table pages to a list and
> > then simply walk the list to free them. This saves ~90 lines of code 
> > overall.
> >
> > NOTE: There is no need to clear and sync PTEs during teardown since the per-
> >   device root entries will have already been cleared (when devices were
> >   de-assigned) so the page tables can no longer be accessed by the 
> > IOMMU.
> >
> > Signed-off-by: Paul Durrant 
> 
> Oh wow - I don't have any polite words for how that code was organised
> before.
> 
> Instead of discussing the ~90 lines saved, what about the removal of a
> global bottleneck (the tasklet) or expand on the removal of redundant
> TLB/cache maintenance?
> 

Ok.

> > diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> > index c386dc4387..fd9b1e7bd5 100644
> > --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> > +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> > @@ -378,64 +380,9 @@ static int amd_iommu_assign_device(struct domain *d, 
> > u8 devfn,
> >  return reassign_device(pdev->domain, d, devfn, pdev);
> >  }
> >
> > -static void deallocate_next_page_table(struct page_info *pg, int level)
> > -{
> > -PFN_ORDER(pg) = level;
> > -spin_lock(&iommu_pt_cleanup_lock);
> > -page_list_add_tail(pg, &iommu_pt_cleanup_list);
> > -spin_unlock(&iommu_pt_cleanup_lock);
> > -}
> > -
> > -static void deallocate_page_table(struct page_info *pg)
> > -{
> > -struct amd_iommu_pte *table_vaddr;
> > -unsigned int index, level = PFN_ORDER(pg);
> > -
> > -PFN_ORDER(pg) = 0;
> > -
> > -if ( level <= 1 )
> > -{
> > -free_amd_iommu_pgtable(pg);
> > -return;
> > -}
> > -
> > -table_vaddr = __map_domain_page(pg);
> > -
> > -for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ )
> > -{
> > -struct amd_iommu_pte *pde = &table_vaddr[index];
> > -
> > -if ( pde->mfn && pde->next_level && pde->pr )
> > -{
> > -/* We do not support skip levels yet */
> > -ASSERT(pde->next_level == level - 1);
> > -deallocate_next_page_table(mfn_to_page(_mfn(pde->mfn)),
> > -   pde->next_level);
> > -}
> > -}
> > -
> > -unmap_domain_page(table_vaddr);
> > -free_amd_iommu_pgtable(pg);
> > -}
> > -
> > -static void deallocate_iommu_page_tables(struct domain *d)
> > -{
> > -struct domain_iommu *hd = dom_iommu(d);
> > -
> > -spin_lock(&hd->arch.mapping_lock);
> > -if ( hd->arch.amd_iommu.root_table )
> > -{
> > -deallocate_next_page_table(hd->arch.amd_iommu.root_table,
> > -   hd->arch.amd_iommu.paging_mode);
> 
> I really need to dust off my patch fixing up several bits of dubious
> logic, including the name "paging_mode" which is actually simply the
> number of levels.
> 
> At this point, it will probably be best to get this series in first, and
> for me to rebase.
> 

Ok.

> > -hd->arch.amd_iommu.root_table = NULL;
> > -}
> > -spin_unlock(&hd->arch.mapping_lock);
> > -}
> > -
> > -
> >  static void amd_iommu_domain_destroy(struct domain *d)
> >  {
> > -deallocate_iommu_page_tables(d);
> > +dom_iommu(d)->arch.amd_iommu.root_table = NULL;
> >  amd_iommu_flush_all_pages(d);
> 
> Per your NOTE:, shouldn't this flush call be dropped?
> 

Indeed it should.

> > diff --git a/xen/drivers/passthrough/x

RE: [PATCH 3/6] iommu: remove iommu_lookup_page() and the lookup_page() method...

2020-07-27 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 26 July 2020 09:28
> To: p...@xen.org
> Cc: 'Andrew Cooper' ; 
> xen-devel@lists.xenproject.org; Durrant, Paul
> ; 'Kevin Tian' 
> Subject: RE: [EXTERNAL] [PATCH 3/6] iommu: remove iommu_lookup_page() and the 
> lookup_page() method...
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24.07.2020 20:53, Paul Durrant wrote:
> >> From: Andrew Cooper 
> >> Sent: 24 July 2020 19:39
> >>
> >> On 24/07/2020 17:46, Paul Durrant wrote:
> >>> From: Paul Durrant 
> >>>
> >>> ... from iommu_ops.
> >>>
> >>> This patch is essentially a reversion of dd93d54f "vtd: add lookup_page 
> >>> method
> >>> to iommu_ops". The code was intended to be used by a patch that has long-
> >>> since been abandoned. Therefore it is dead code and can be removed.
> >>
> >> And by this, you mean the work that you only partial unstreamed, with
> >> the remainder of the feature still very much in use by XenServer?
> >>
> >
> > I thought we basically decided to bin the original PV IOMMU idea though?
> 
> Did we? It's the first time I hear of it, I think.
> 

I circulated a doc. ages ago, while I was still at Citrix: 
https://docs.google.com/document/d/12-z6JD41J_oNrCg_c0yAxGWg5ADBQ8_bSiP_NH6Hqwo/edit?usp=sharing

In there I propose that we don't follow the original idea of keeping a single 
set of per-domain tables but instead have a set of tables (or IOMMU contexts) 
for groups of devices. 'Context 0' is the current set of static 1:1 tables but 
other contexts are manipulated by hypercall so, in this plan, I don't envisage 
the need to look up mappings in the tables in this way... but I guess I can't 
rule it out.

 Paul 



RE: [PATCH 6/6] iommu: stop calling IOMMU page tables 'p2m tables'

2020-07-27 Thread Durrant, Paul
> -Original Message-
> From: Andrew Cooper 
> Sent: 24 July 2020 20:09
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Jan Beulich ; 
> Kevin Tian
> 
> Subject: RE: [EXTERNAL] [PATCH 6/6] iommu: stop calling IOMMU page tables 
> 'p2m tables'
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24/07/2020 17:46, Paul Durrant wrote:
> > diff --git a/xen/drivers/passthrough/iommu.c 
> > b/xen/drivers/passthrough/iommu.c
> > index 6a3803ff2c..5bc190bf98 100644
> > --- a/xen/drivers/passthrough/iommu.c
> > +++ b/xen/drivers/passthrough/iommu.c
> > @@ -535,12 +535,12 @@ static void iommu_dump_p2m_table(unsigned char key)
> >
> >  if ( iommu_use_hap_pt(d) )
> >  {
> > -printk("\ndomain%d IOMMU p2m table shared with MMU: \n", 
> > d->domain_id);
> > +printk("%pd: IOMMU page tables shared with MMU\n", d);
> 
> Change MMU to CPU?  MMU is very ambiguous in this context.
> 

Actually I could push this into the VT-d code and just say something like 
'shared EPT is enabled'. Would that be less ambiguous?

> >  continue;
> >  }
> >
> > -printk("\ndomain%d IOMMU p2m table: \n", d->domain_id);
> > -ops->dump_p2m_table(d);
> > +printk("%pd: IOMMU page tables: \n", d);
> 
> Drop the trailing whitespace?
> 

Sure.

  Paul

> ~Andrew


RE: [PATCH 4/6] remove remaining uses of iommu_legacy_map/unmap

2020-07-29 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 26 July 2020 09:36
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Andrew Cooper
> ; Wei Liu ; Roger Pau Monné 
> ; George
> Dunlap ; Ian Jackson ; 
> Julien Grall
> ; Stefano Stabellini ; Jun Nakajima 
> ;
> Kevin Tian 
> Subject: RE: [EXTERNAL] [PATCH 4/6] remove remaining uses of 
> iommu_legacy_map/unmap
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24.07.2020 18:46, Paul Durrant wrote:
> > ---
> >  xen/arch/x86/mm.c   | 22 +++-
> >  xen/arch/x86/mm/p2m-ept.c   | 22 +---
> >  xen/arch/x86/mm/p2m-pt.c| 17 +++
> >  xen/arch/x86/mm/p2m.c   | 28 ++---
> >  xen/arch/x86/x86_64/mm.c| 27 ++--
> >  xen/common/grant_table.c| 36 +---
> >  xen/common/memory.c |  7 +++
> >  xen/drivers/passthrough/iommu.c | 37 +
> >  xen/include/xen/iommu.h | 20 +-
> >  9 files changed, 123 insertions(+), 93 deletions(-)
> 
> Overall more code. I wonder whether a map-and-flush function (named
> differently than the current ones) wouldn't still be worthwhile to
> have.

Agreed but an extra 30 lines is not huge. I'd still like to keep map/unmap and 
flush separate but I'll see if I can reduce the added lines.

> 
> > --- a/xen/common/grant_table.c
> > +++ b/xen/common/grant_table.c
> > @@ -1225,11 +1225,25 @@ map_grant_ref(
> >  kind = IOMMUF_readable;
> >  else
> >  kind = 0;
> > -if ( kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 0, kind) )
> > +if ( kind )
> >  {
> > -double_gt_unlock(lgt, rgt);
> > -rc = GNTST_general_error;
> > -goto undo_out;
> > +dfn_t dfn = _dfn(mfn_x(mfn));
> > +unsigned int flush_flags = 0;
> > +int err;
> > +
> > +err = iommu_map(ld, dfn, mfn, 0, kind, &flush_flags);
> > +if ( err )
> > +rc = GNTST_general_error;
> > +
> > +err = iommu_iotlb_flush(ld, dfn, 1, flush_flags);
> > +if ( err )
> > +rc = GNTST_general_error;
> > +
> > +if ( rc != GNTST_okay )
> > +{
> > +double_gt_unlock(lgt, rgt);
> > +goto undo_out;
> > +}
> >  }
> 
> The mapping needs to happen with at least ld's lock held, yes. But
> is the same true also for the flushing? Can't (not necessarily
> right in this change) the flush be pulled out of the function and
> instead done once per batch that got processed?
> 

True, the locks need not be held across the flush. I'll have a look at batching 
too.

  Paul

> Jan


RE: [PATCH 5/6] iommu: remove the share_p2m operation

2020-07-29 Thread Durrant, Paul
> -Original Message-
> From: Andrew Cooper 
> Sent: 24 July 2020 20:01
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Jan Beulich ; 
> George Dunlap
> ; Wei Liu ; Roger Pau Monné 
> ; Kevin Tian
> 
> Subject: RE: [EXTERNAL] [PATCH 5/6] iommu: remove the share_p2m operation
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24/07/2020 17:46, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > Sharing of HAP tables is VT-d specific so the operation is never defined for
> > AMD IOMMU.
> 
> It's not VT-d specific, and Xen really did used to share on AMD.
> 

Oh, I never thought that ever worked.

> In fact, a remnant of this logic is still present in the form of the
> comment for p2m_type_t explaining why p2m_ram_rw needs to be 0.
> 
> That said, I agree it shouldn't be a hook, because it is statically
> known at domain create time based on flags and hardware properties.
> 

Well, for VT-d that may well change in future ;-)

> >  There's also no need to pro-actively set vtd.pgd_maddr when using
> > shared EPT as it is straightforward to simply define a helper function to
> > return the appropriate value in the shared and non-shared cases.
> 
> It occurs to me that vtd.pgd_maddr and amd.root_table really are the
> same thing, and furthermore are common to all IOMMU implementations on
> any architecture.  Is it worth trying to make them common, rather than
> continuing to let each implementation handle them differently?

I looked at this. The problem really lies in terminology. The 'root table' in 
the VT-d code refers to the single root table which IIRC is called the device 
table in the AMD IOMMU code so, whilst I could convert both to use a single 
common field, I think it's not really that valuable to do so since it's likely 
to make the code slightly more confusing to read (for someone expecting the 
names to tally with the spec).

  Paul

> 
> ~Andrew


RE: [PATCH 5/6] iommu: remove the share_p2m operation

2020-07-29 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 26 July 2020 09:50
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Andrew Cooper
> ; George Dunlap ; Wei 
> Liu ; Roger Pau
> Monné ; Kevin Tian 
> Subject: RE: [EXTERNAL] [PATCH 5/6] iommu: remove the share_p2m operation
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 24.07.2020 18:46, Paul Durrant wrote:
> > --- a/xen/drivers/passthrough/vtd/iommu.c
> > +++ b/xen/drivers/passthrough/vtd/iommu.c
> > @@ -313,6 +313,26 @@ static u64 addr_to_dma_page_maddr(struct domain 
> > *domain, u64 addr, int alloc)
> >  return pte_maddr;
> >  }
> >
> > +static u64 domain_pgd_maddr(struct domain *d)
> 
> uint64_t please.
> 

Ok.

> > +{
> > +struct domain_iommu *hd = dom_iommu(d);
> > +
> > +ASSERT(spin_is_locked(&hd->arch.mapping_lock));
> > +
> > +if ( iommu_use_hap_pt(d) )
> > +{
> > +mfn_t pgd_mfn =
> > +pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
> > +
> > +return pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
> > +}
> > +
> > +if ( !hd->arch.vtd.pgd_maddr )
> > +addr_to_dma_page_maddr(d, 0, 1);
> > +
> > +return hd->arch.vtd.pgd_maddr;
> > +}
> > +
> >  static void iommu_flush_write_buffer(struct vtd_iommu *iommu)
> >  {
> >  u32 val;
> > @@ -1347,22 +1367,17 @@ int domain_context_mapping_one(
> >  {
> >  spin_lock(&hd->arch.mapping_lock);
> >
> > -/* Ensure we have pagetables allocated down to leaf PTE. */
> > -if ( hd->arch.vtd.pgd_maddr == 0 )
> > +pgd_maddr = domain_pgd_maddr(domain);
> > +if ( !pgd_maddr )
> >  {
> > -addr_to_dma_page_maddr(domain, 0, 1);
> > -if ( hd->arch.vtd.pgd_maddr == 0 )
> > -{
> > -nomem:
> > -spin_unlock(&hd->arch.mapping_lock);
> > -spin_unlock(&iommu->lock);
> > -unmap_vtd_domain_page(context_entries);
> > -return -ENOMEM;
> > -}
> > +nomem:
> > +spin_unlock(&hd->arch.mapping_lock);
> > +spin_unlock(&iommu->lock);
> > +unmap_vtd_domain_page(context_entries);
> > +return -ENOMEM;
> >  }
> 
> This renders all calls bogus in shared mode - the function, if
> it ended up getting called nevertheless, would then still alloc
> the root table. Therefore I'd like to suggest that at least all
> its callers get an explicit check. That's really just
> dma_pte_clear_one() as it looks.
> 

Ok, I think I may move this code out into a separate function too since the 
nomem label is slightly awkward, so I'll re-factor it.

  Paul

> Jan


RE: [PATCH v2 08/10] remove remaining uses of iommu_legacy_map/unmap

2020-07-31 Thread Durrant, Paul
> -Original Message-
> From: Paul Durrant 
> Sent: 30 July 2020 15:29
> To: xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Jan Beulich ; 
> Andrew Cooper
> ; Wei Liu ; Roger Pau Monné 
> ; George
> Dunlap ; Ian Jackson ; 
> Julien Grall
> ; Stefano Stabellini ; Jun Nakajima 
> ;
> Kevin Tian 
> Subject: [EXTERNAL] [PATCH v2 08/10] remove remaining uses of 
> iommu_legacy_map/unmap
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> From: Paul Durrant 
> 
> The 'legacy' functions do implicit flushing so amend the callers to do the
> appropriate flushing.
> 
> Unfortunately, because of the structure of the P2M code, we cannot remove
> the per-CPU 'iommu_dont_flush_iotlb' global and the optimization it
> facilitates. It is now checked directly iommu_iotlb_flush(). Also, it is
> now declared as bool (rather than bool_t) and setting/clearing it are no
> longer pointlessly gated on is_iommu_enabled() returning true. (Arguably
> it is also pointless to gate the call to iommu_iotlb_flush() on that
> condition - since it is a no-op in that case - but the if clause allows
> the scope of a stack variable to be restricted).
> 
> NOTE: The code in memory_add() now fails if the number of pages passed to
>   a single call overflows an unsigned int. I don't believe this will
>   ever happen in practice.
> 
> Signed-off-by: Paul Durrant 

I realise now that I completely forgot to address Jan's comments on grant table 
locking and flush batching, so there will be a v3 of at least this patch.

  Paul


RE: [PATCH v2 2/2] x86/hvm: simplify 'mmio_direct' check in epte_get_entry_emt()

2020-07-31 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 31 July 2020 14:41
> To: p...@xen.org
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> 'Andrew Cooper'
> ; 'Wei Liu' ; 'Roger Pau Monné' 
> 
> Subject: RE: [EXTERNAL] [PATCH v2 2/2] x86/hvm: simplify 'mmio_direct' check 
> in epte_get_entry_emt()
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 31.07.2020 15:07, Paul Durrant wrote:
> >> -Original Message-
> >> From: Jan Beulich 
> >> Sent: 31 July 2020 13:58
> >> To: Paul Durrant 
> >> Cc: xen-devel@lists.xenproject.org; Paul Durrant ; 
> >> Andrew Cooper
> >> ; Wei Liu ; Roger Pau Monné 
> >> 
> >> Subject: Re: [PATCH v2 2/2] x86/hvm: simplify 'mmio_direct' check in 
> >> epte_get_entry_emt()
> >>
> >> On 31.07.2020 14:39, Paul Durrant wrote:
> >>> From: Paul Durrant 
> >>>
> >>> Re-factor the code to take advantage of the fact that the APIC access 
> >>> page is
> >>> a 'special' page.
> >>
> >> Hmm, that's going quite as far as I was thinking to go: In
> >> particular, you leave in place the set_mmio_p2m_entry() use
> >> in vmx_alloc_vlapic_mapping(). With that replaced, the
> >> re-ordering in epte_get_entry_emt() that you do shouldn't
> >> be necessary; you'd simple drop the checking of the
> >> specific MFN.
> >
> > Ok, it still needs to go in the p2m though so are you suggesting
> > just calling p2m_set_entry() directly?
> 
> Yes, if this works. The main question really is whether there are
> any hidden assumptions elsewhere that this page gets mapped as an
> MMIO one.
> 
> >>> --- a/xen/arch/x86/hvm/mtrr.c
> >>> +++ b/xen/arch/x86/hvm/mtrr.c
> >>> @@ -814,29 +814,22 @@ int epte_get_entry_emt(struct domain *d, unsigned 
> >>> long gfn, mfn_t mfn,
> >>>  return -1;
> >>>  }
> >>>
> >>> -if ( direct_mmio )
> >>> -{
> >>> -if ( (mfn_x(mfn) ^ mfn_x(d->arch.hvm.vmx.apic_access_mfn)) >> 
> >>> order )
> >>> -return MTRR_TYPE_UNCACHABLE;
> >>> -if ( order )
> >>> -return -1;
> >>
> >> ... this part of the logic wants retaining, I think, i.e.
> >> reporting back to the guest that the mapping needs splitting.
> >> I'm afraid I have to withdraw my R-b on patch 1 for this
> >> reason, as the check needs to be added there already.
> >
> > To be clear... You mean I need the:
> >
> > if ( order )
> >   return -1;
> >
> > there?
> 
> Not just - first of all you need to check whether the requested
> range overlaps a special page.

Ok. I'll add that.

  Paul


> 
> Jan


RE: [PATCH v3 1/2] x86/hvm: set 'ipat' in EPT for special pages

2020-07-31 Thread Durrant, Paul
> -Original Message-
> From: Paul Durrant 
> Sent: 31 July 2020 15:26
> To: xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Jan Beulich ; 
> Andrew Cooper
> ; Wei Liu ; Roger Pau Monné 
> 
> Subject: [EXTERNAL] [PATCH v3 1/2] x86/hvm: set 'ipat' in EPT for special 
> pages
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> From: Paul Durrant 
> 
> All non-MMIO ranges (i.e those not mapping real device MMIO regions) that
> map valid MFNs are normally marked MTRR_TYPE_WRBACK and 'ipat' is set. Hence
> when PV drivers running in a guest populate the BAR space of the Xen Platform
> PCI Device with pages such as the Shared Info page or Grant Table pages,
> accesses to these pages will be cachable.
> 
> However, should IOMMU mappings be enabled for the guest then these
> accesses become uncachable. This has a substantial negative effect on I/O
> throughput of PV devices. Arguably PV drivers should not be using BAR space to
> host the Shared Info and Grant Table pages but it is currently commonplace for
> them to do this and so this problem needs mitigation. Hence this patch makes
> sure the 'ipat' bit is set for any special page regardless of where in GFN
> space it is mapped.
> 
> NOTE: Clearly this mitigation only applies to Intel EPT. It is not obvious
>   that there is any similar mitigation possible for AMD NPT. Downstreams
>   such as Citrix XenServer have been carrying a patch similar to this for
>   several releases though.
> 
> Signed-off-by: Paul Durrant 

This is missing a hunk. I'll send v4.

  Paul

> ---
> Cc: Jan Beulich 
> Cc: Andrew Cooper 
> Cc: Wei Liu 
> Cc: "Roger Pau Monné" 
> 
> v3:
>  - dropping Jan's R-b
>  - cope with order > 0
> ---
>  xen/arch/x86/hvm/mtrr.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
> index 511c3be1c8..26721f6ee7 100644
> --- a/xen/arch/x86/hvm/mtrr.c
> +++ b/xen/arch/x86/hvm/mtrr.c
> @@ -836,6 +836,17 @@ int epte_get_entry_emt(struct domain *d, unsigned long 
> gfn, mfn_t mfn,
>  return MTRR_TYPE_WRBACK;
>  }
> 
> +for ( i = 0; i < (1ul << order); i++ )
> +{
> +if ( is_special_page(mfn_to_page(mfn_add(mfn, i))) )
> +{
> +if ( order )
> +return -1;
> +*ipat = 1;
> +return MTRR_TYPE_WRBACK;
> +}
> +}
> +
>  gmtrr_mtype = hvm_get_mem_pinned_cacheattr(d, _gfn(gfn), order);
>  if ( gmtrr_mtype >= 0 )
>  {
> --
> 2.20.1



RE: [PATCH 3/4] public/io/netif: specify MTU override node

2020-08-03 Thread Durrant, Paul
> -Original Message-
> From: Jürgen Groß 
> Sent: 03 August 2020 06:09
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Durrant, Paul 
> Subject: RE: [EXTERNAL] [PATCH 3/4] public/io/netif: specify MTU override node
> 
> On 30.07.20 21:48, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > There is currently no documentation to state what MTU a frontend should
> advertise to its network stack. It has however long been assumed that the
> > default value of 1500 is correct.
> >
> > This patch specifies a mechanism to allow the tools to set the MTU via a
> > xenstore node in the frontend area and states that the absence of that node
> > means the frontend should assume an MTU of 1500 octets.
> >
> > NOTE: The Windows PV frontend has used an MTU sampled from the xenstore
> >node specified in this patch.
> >
> > Signed-off-by: Paul Durrant 
> 
> Can you please update docs/misc/xenstore-paths.pandoc accordingly?

Sure. Given the path is for use by tools then it should indeed be documented 
there as well.

> With that you can have my:
> 
> Reviewed-by: Juergen Gross 
> 

Thanks,

  Paul

> 
> Juergen


RE: [PATCH v3 02/11] x86/iommu: add common page-table allocator

2020-08-03 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 03 August 2020 16:59
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Andrew Cooper
> ; Wei Liu ; Roger Pau Monné 
> 
> Subject: RE: [EXTERNAL] [PATCH v3 02/11] x86/iommu: add common page-table 
> allocator
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 03.08.2020 14:29, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > Instead of having separate page table allocation functions in VT-d and AMD
> > IOMMU code, we could use a common allocation function in the general x86 
> > code.
> >
> > This patch adds a new allocation function, iommu_alloc_pgtable(), for this
> > purpose. The function adds the page table pages to a list. The pages in this
> > list are then freed by iommu_free_pgtables(), which is called by
> > domain_relinquish_resources() after PCI devices have been de-assigned.
> >
> > Signed-off-by: Paul Durrant 
> > ---
> > Cc: Jan Beulich 
> > Cc: Andrew Cooper 
> > Cc: Wei Liu 
> > Cc: "Roger Pau Monné" 
> >
> > v2:
> >  - This is split out from a larger patch of the same name in v1
> > ---
> >  xen/arch/x86/domain.c   |  9 +-
> >  xen/drivers/passthrough/x86/iommu.c | 50 +
> >  xen/include/asm-x86/iommu.h |  7 
> >  3 files changed, 65 insertions(+), 1 deletion(-)
> >
> > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> > index f8084dc9e3..d1ecc7b83b 100644
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -2153,7 +2153,8 @@ int domain_relinquish_resources(struct domain *d)
> >  d->arch.rel_priv = PROG_ ## x; /* Fallthrough */ case PROG_ ## x
> >
> >  enum {
> > -PROG_paging = 1,
> > +PROG_iommu_pagetables = 1,
> > +PROG_paging,
> >  PROG_vcpu_pagetables,
> >  PROG_shared,
> >  PROG_xen,
> 
> Is there a particular reason to make this new item the very first
> one?

It seems like the logical place as it comes straight after device de-assignment.

> 
> > @@ -257,6 +260,53 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain 
> > *d)
> >  return;
> >  }
> >
> > +int iommu_free_pgtables(struct domain *d)
> > +{
> > +struct domain_iommu *hd = dom_iommu(d);
> > +struct page_info *pg;
> > +
> > +while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) )
> > +{
> > +free_domheap_page(pg);
> > +
> > +if ( general_preempt_check() )
> > +return -ERESTART;
> 
> Perhaps better only check once every so many pages?
> 

Yes, that would be reasonable.

> > +struct page_info *iommu_alloc_pgtable(struct domain *d)
> > +{
> > +struct domain_iommu *hd = dom_iommu(d);
> > +unsigned int memflags = 0;
> > +struct page_info *pg;
> > +void *p;
> > +
> > +#ifdef CONFIG_NUMA
> > +if (hd->node != NUMA_NO_NODE)
> 
> Missing blanks inside parentheses.
> 

Oh yes... bad conversion from ternary statement in previous version.

> > @@ -131,6 +135,9 @@ int pi_update_irte(const struct pi_desc *pi_desc, const 
> > struct pirq *pirq,
> >  iommu_vcall(ops, sync_cache, addr, size);   \
> >  })
> >
> > +int __must_check iommu_free_pgtables(struct domain *d);
> > +struct page_info * __must_check iommu_alloc_pgtable(struct domain *d);
> 
> Commonly we put a blank on the left side of *, but none to its right.
> 

Kind of felt wrong not to separate it from '__must_check'.

  Paul

> Jan


RE: [PATCH v2 4/4] tools/hotplug: modify set_mtu() to inform the frontend via xenstore

2020-08-05 Thread Durrant, Paul
> -Original Message-
> From: Ian Jackson 
> Sent: 05 August 2020 10:31
> To: p...@xen.org
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> 'Wei Liu' 
> Subject: RE: [EXTERNAL] [PATCH v2 4/4] tools/hotplug: modify set_mtu() to 
> inform the frontend via
> xenstore
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> Paul Durrant writes ("RE: [PATCH v2 4/4] tools/hotplug: modify set_mtu() to 
> inform the frontend via
> xenstore"):
> > > -Original Message-
> > > From: Ian Jackson 
> ...
> > > Actually.
> > >
> > > This shouldn't be in the frontend at all, should it ?  In general the
> > > backend writes to the backend and the frontend to the frontend.
> > >
> > > So maybe I need to take back my R-b of
> > >   [PATCH v2 3/4] public/io/netif: specify MTU override node
> > >
> > > Sorry for the confusion.  I seem rather undercaffienated today.
> >
> > Too late. The xenstore node has been used by Windows frontends for the best 
> > part of a decade so we
> can't practically change the
> > path. Another way would be to also modify netback to simply echo the value 
> > from backend into
> frontend, but that seems rather
> > pointless.
> 
> Hmm.  How does this interact with driver domains ?  I think a driver
> domain might not have write access to this node.
> 

That's a good point; I think we will also need to actually write it from libxl 
first in that case.

> Is there a value we can store in it that won't break these Windows
> frontends, that libxl in the toolstack domain could write, before the
> hotplug script runs in the driver domain ?
> 
> > Interestingly libxl does define an 'mtu' field for libxl_device_nic, which 
> > it sets to 1492 in
> libxl__device_nic_setdefault() but
> > never writes it into xenstore. There is even a comment:
> >
> > /* nic->mtu = */
> >
> > in libxl__nic_from_xenstore() which implies it should have been there, but 
> > isn't.
> > I still think picking up the MTU from the bridge is the better way though.
> 
> I agree that the default should come from the bridge.  Ideally there
> would be a way to override it in the config.
> 

Well, I guess we address the driver domain issue in this way too... I will add 
a patch to libxl to write the libxl_device_nic mtu value into xenstore, in both 
backend (where it should always have been) and frontend. I think the current 
setting of 1492 can be changed to 1500 safely (since nothing appears to 
currently use that value). The hotplug script should then have sufficient 
access to update, and a subsequent patch can add a mechanism to set the value 
from the config.

  Paul



RE: [PATCH v2 4/4] tools/hotplug: modify set_mtu() to inform the frontend via xenstore

2020-08-05 Thread Durrant, Paul
> -Original Message-
> From: Ian Jackson 
> Sent: 05 August 2020 11:13
> To: Durrant, Paul 
> Cc: p...@xen.org; xen-devel@lists.xenproject.org; 'Wei Liu' 
> Subject: RE: [EXTERNAL] [PATCH v2 4/4] tools/hotplug: modify set_mtu() to 
> inform the frontend via
> xenstore
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> Durrant, Paul writes ("RE: [PATCH v2 4/4] tools/hotplug: modify set_mtu() to 
> inform the frontend via
> xenstore"):
> > > -Original Message-
> > > From: Ian Jackson 
> ...
> > Well, I guess we address the driver domain issue in this way
> > too... I will add a patch to libxl to write the libxl_device_nic mtu
> > value into xenstore,
> 
> Do you mean libxl in dom0 or libxl in the driver domain ?
> 
> libxl contains code that runs in both contexts.
> 
> See device_hotplug in libxl_device.c, in particular notice
> if (aodev->dev->backend_domid != domid) {
> 
> If you want the mtu to be read from the bridge, it can only be
> determined by the driver domain, because the bridge is in the driver
> domain.
> 
> In v2 of this series you arrange for the hotplug script to copy the
> mtu from the bridge into the frontend path.  That won't work because
> the hotplug script can't write to that xenstore node because (unlike a
> dom0 backend) a driver domain backend doesn't have write access to
> the frontend so can't create a new node there.
> 
> ISTM that it is correct that it is the hotplug script that does this
> interface setup.  If it weren't for this erroneous use of the frontend
> path I think the right design would be:
>   * toolstack libxl reads the config file to find whether there is an MTU
>   * toolstack libxl writes mtu node in backend iff one was in config
> (and leaves the node absent otherwise)

This is where the 'subsequent patch' comes in (see the end of the email)...

>   * driver domain libxl runs hotplug script
>   * driver domain hotplug script looks for mtu in backend; if there
> isn't one, it gets the value from the bridge and writes it to
> the backend in xenstore
>   * driver domain backend driver reads mtu value from backend path
>   * guest domain frontend driver reads mtu value from backend path
>   * on domain save/migrate, toolstack libxl will record the mtu
> value as the actual configuration so that the migrated domain
> will get the same mtu
> 

That sounds right.

> I don't think I understand what (in these kind of terms) you are
> proposing, in order to support the frontends that want to read the mtu
> from the frontend.

We need some way for creating the frontend node such that the driver domain has 
write access. Presumably there is a suitable place in the toolstack instance of 
libxl to do this. This would mean we either need to write a sentinel 'invalid' 
value or write the default value. In the default case we could still leave the 
backend node absent so the hotplug script will still know whether or not a 
value was set in the cfg.

> 
> >  I think the current setting of 1492 can be changed to 1500 safely
> > (since nothing appears to currently use that value).
> 
> Right, that seems correct to me.
> 
> > The hotplug script should then have sufficient access to update, and
> > a subsequent patch can add a mechanism to set the value from the
> > config.
> 
> I think what I am missing is how this "subsequent patch" would work ?
> Ie what design are we aiming for, that we are now implementing part
> of ?

The subsequent patch would be one that actually acquires the mtu value from the 
vif config, and adds documentation to xl-network-configuration.5.pod, since 
this is currently missing.

  Paul

> 
> Ian.



RE: [PATCH v4 07/14] iommu: make map, unmap and flush all take both an order and a count

2020-08-11 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 06 August 2020 10:57
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Jun Nakajima
> ; Kevin Tian ; Andrew Cooper
> ; George Dunlap ; Wei 
> Liu ; Roger Pau
> Monné ; Ian Jackson ; Julien 
> Grall ;
> Stefano Stabellini ; Volodymyr Babchuk 
> 
> Subject: RE: [EXTERNAL] [PATCH v4 07/14] iommu: make map, unmap and flush all 
> take both an order and a
> count
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 04.08.2020 15:42, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > At the moment iommu_map() and iommu_unmap() take a page order but not a
> > count, whereas iommu_iotlb_flush() takes a count but not a page order.
> > This patch simply makes them consistent with each other.
> 
> Why can't we do with just a count, where order gets worked out by
> functions knowing how to / wanting to deal with higher order pages?
> 

Yes, that may well be better. The order of the CPU mappings isn't really 
relevant cases where the IOMMU uses different page orders. I'll just move 
everything over to a page count.

  Paul

> > --- a/xen/arch/x86/mm/p2m-ept.c
> > +++ b/xen/arch/x86/mm/p2m-ept.c
> > @@ -843,7 +843,7 @@ out:
> >   need_modify_vtd_table )
> >  {
> >  if ( iommu_use_hap_pt(d) )
> > -rc = iommu_iotlb_flush(d, _dfn(gfn), (1u << order),
> > +rc = iommu_iotlb_flush(d, _dfn(gfn), (1u << order), 1,
> 
> Forgot to drop the "1 << "? (There are then I think two more instances
> further down.)
> 
> > --- a/xen/common/memory.c
> > +++ b/xen/common/memory.c
> > @@ -851,12 +851,12 @@ int xenmem_add_to_physmap(struct domain *d, struct 
> > xen_add_to_physmap *xatp,
> >
> >  this_cpu(iommu_dont_flush_iotlb) = 0;
> >
> > -ret = iommu_iotlb_flush(d, _dfn(xatp->idx - done), done,
> > +ret = iommu_iotlb_flush(d, _dfn(xatp->idx - done), 0, done,
> 
> Arguments wrong way round? (This risk of inverting their order is
> one of the primary reasons why I think we want just a count.) I'm
> also uncertain about the use of 0 vs PAGE_ORDER_4K here.
> 
> >  IOMMU_FLUSHF_added | 
> > IOMMU_FLUSHF_modified);
> >  if ( unlikely(ret) && rc >= 0 )
> >  rc = ret;
> >
> > -ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done,
> > +ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), 0, done,
> 
> Same here then.
> 
> Jan


RE: [PATCH v4 12/14] vtd: use a bit field for root_entry

2020-08-12 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 06 August 2020 13:34
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Kevin Tian
> 
> Subject: RE: [EXTERNAL] [PATCH v4 12/14] vtd: use a bit field for root_entry
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 04.08.2020 15:42, Paul Durrant wrote:
> > --- a/xen/drivers/passthrough/vtd/iommu.h
> > +++ b/xen/drivers/passthrough/vtd/iommu.h
> > @@ -184,21 +184,28 @@
> >  #define dma_frcd_source_id(c) (c & 0xffff)
> >  #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
> >
> > -/*
> > - * 0: Present
> > - * 1-11: Reserved
> > - * 12-63: Context Ptr (12 - (haw-1))
> > - * 64-127: Reserved
> > - */
> >  struct root_entry {
> > -u64val;
> > -u64rsvd1;
> > +union {
> > +__uint128_t val;
> 
> I couldn't find a use of this field, and I also can't foresee any.
> Could it be left out?

Yes, probably.

> 
> > +struct { uint64_t lo, hi; };
> > +struct {
> > +/* 0 - 63 */
> > +uint64_t p:1;
> 
> bool?
> 

I'd prefer not to. One of the points of using a bit field (at least from my 
PoV) is that it makes referring back to the spec. much easier, by using 
uint64_t types consistently and hence using bit widths that can be 
straightforwardly summed to give the bit offsets stated in the spec.

> > +uint64_t reserved0:11;
> > +uint64_t ctp:52;
> > +
> > +/* 64 - 127 */
> > +uint64_t reserved1;
> > +};
> > +};
> >  };
> > -#define root_present(root)((root).val & 1)
> > -#define set_root_present(root) do {(root).val |= 1;} while(0)
> > -#define get_context_addr(root) ((root).val & PAGE_MASK_4K)
> > -#define set_root_value(root, value) \
> > -do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
> > +
> > +#define root_present(r) (r).p
> > +#define set_root_present(r) do { (r).p = 1; } while (0)
> 
> And then "true" here?
> 
> > +#define root_ctp(r) ((r).ctp << PAGE_SHIFT_4K)
> > +#define set_root_ctp(r, val) \
> > +do { (r).ctp = ((val) >> PAGE_SHIFT_4K); } while (0)
> 
> For documentation purposes, can the 2nd macro param be named maddr
> or some such?
> 

Sure.

> > --- a/xen/drivers/passthrough/vtd/x86/ats.c
> > +++ b/xen/drivers/passthrough/vtd/x86/ats.c
> > @@ -74,8 +74,8 @@ int ats_device(const struct pci_dev *pdev, const struct 
> > acpi_drhd_unit *drhd)
> >  static bool device_in_domain(const struct vtd_iommu *iommu,
> >   const struct pci_dev *pdev, uint16_t did)
> >  {
> > -struct root_entry *root_entry;
> > -struct context_entry *ctxt_entry = NULL;
> > +struct root_entry *root_entry, *root_entries = NULL;
> > +struct context_entry *context_entry, *context_entries = NULL;
> 
> Just like root_entry, root_entries doesn't look to need an initializer.
> I'm unconvinced anyway that you now need two variables each:
> unmap_vtd_domain_page() does quite fine with the low 12 bits not all
> being zero, afaict.

Not passing a page aligned address into something that unmaps a page seems a 
little bit fragile though, e.g. if someone happened to add a check in future. 
I'll see if I can drop the initializer though.

  Paul

> 
> Jan


RE: [PATCH] x86 / viridian: remove the viridian_vcpu msg_pending bit mask

2020-08-13 Thread Durrant, Paul
> -Original Message-
> From: Roger Pau Monné 
> Sent: 13 August 2020 11:16
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Wei Liu ; Jan
> Beulich ; Andrew Cooper 
> Subject: RE: [EXTERNAL] [PATCH] x86 / viridian: remove the viridian_vcpu 
> msg_pending bit mask
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On Thu, Aug 13, 2020 at 10:57:23AM +0100, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > The mask does not actually serve a useful purpose as we only use the SynIC
> > for timer messages.
> 
> Oh, I see. I assume it doesn't make sense because there can only be a
> single message pending (a timer one), and hence there isn't much value
> in doing this SynIC pending tracking?

Yes, exactly. We'd potentially need to add code back in if we use the synic for 
other message types (e.g. implementing HvPostMessage, perhaps on top of the 
argo framework).

> 
> > Dropping the mask means that the EOM MSR handler
> > essentially becomes a no-op. This means we can avoid setting 
> > 'message_pending'
> > for timer messages and hence avoid a VMEXIT for the EOM.
> >
> > Signed-off-by: Paul Durrant 
> 
> Reviewed-by: Roger Pau Monné 
> 

Thanks.

> I've got some question below and one nit.
> 
> > ---
> > Cc: Wei Liu 
> > Cc: Jan Beulich 
> > Cc: Andrew Cooper 
> > Cc: "Roger Pau Monné" 
> >
> > This should hopefully simplify Roger's "x86/vlapic: implement EOI callbacks"
> > series a little.
> > ---
> >  xen/arch/x86/hvm/viridian/synic.c  | 24 +---
> >  xen/arch/x86/hvm/vlapic.c  |  2 --
> >  xen/include/asm-x86/hvm/viridian.h |  2 --
> >  3 files changed, 1 insertion(+), 27 deletions(-)
> >
> > diff --git a/xen/arch/x86/hvm/viridian/synic.c 
> > b/xen/arch/x86/hvm/viridian/synic.c
> > index 94a2b88733..22e2df27e5 100644
> > --- a/xen/arch/x86/hvm/viridian/synic.c
> > +++ b/xen/arch/x86/hvm/viridian/synic.c
> > @@ -137,7 +137,6 @@ int viridian_synic_wrmsr(struct vcpu *v, uint32_t idx, 
> > uint64_t val)
> >  if ( !(viridian_feature_mask(d) & HVMPV_synic) )
> >  return X86EMUL_EXCEPTION;
> >
> > -vv->msg_pending = 0;
> >  break;
> >
> >  case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
> > @@ -168,9 +167,6 @@ int viridian_synic_wrmsr(struct vcpu *v, uint32_t idx, 
> > uint64_t val)
> >  printk(XENLOG_G_INFO "%pv: VIRIDIAN SINT%u: vector: %x\n", v, 
> > sintx,
> > vector);
> >
> > -if ( new.polling )
> > -__clear_bit(sintx, &vv->msg_pending);
> > -
> >  *vs = new;
> >  break;
> >  }
> > @@ -334,9 +330,6 @@ bool viridian_synic_deliver_timer_msg(struct vcpu *v, 
> > unsigned int sintx,
> >  .DeliveryTime = delivery,
> >  };
> >
> > -if ( test_bit(sintx, &vv->msg_pending) )
> > -return false;
> > -
> >  /*
> >   * To avoid using an atomic test-and-set, and barrier before calling
> >   * vlapic_set_irq(), this function must be called in context of the
> > @@ -346,12 +339,9 @@ bool viridian_synic_deliver_timer_msg(struct vcpu *v, 
> > unsigned int sintx,
> >
> >  msg += sintx;
> >
> > +/* There is no need to set message_pending as we do not require an EOM 
> > */
> >  if ( msg->header.message_type != HVMSG_NONE )
> 
> I think it's fine to use HVMSG_NONE ATM because Xen only knows about
> timer messages, but long term wouldn't it be better to use
> HVMSG_TIMER_EXPIRED?
> 

I don't think so. The test is supposed to be 'is the slot occupied'. In future 
we could introduce support for new message types and hence a check of != 
HVMSG_NONE seems like the right thing to do.

> > -{
> > -msg->header.message_flags.msg_pending = 1;
> > -__set_bit(sintx, &vv->msg_pending);
> >  return false;
> > -}
> >
> >  msg->header.message_type = HVMSG_TIMER_EXPIRED;
> >  msg->header.message_flags.msg_pending = 0;
> > @@ -380,18 +370,6 @@ bool viridian_synic_is_auto_eoi_sint(const struct vcpu 
> > *v,
> >  return vs->auto_eoi;
> >  }
> >
> > -void viridian_synic_ack_sint(const struct vcpu *v, unsigned int vector)
> > -{
> &g

RE: [PATCH v4 03/14] x86/iommu: convert VT-d code to use new page table allocator

2020-08-14 Thread Durrant, Paul
> -Original Message-
[snip]
> > -static void iommu_free_page_table(struct page_info *pg)
> > -{
> > -unsigned int i, next_level = PFN_ORDER(pg) - 1;
> > -u64 pt_maddr = page_to_maddr(pg);
> > -struct dma_pte *pt_vaddr, *pte;
> > -
> > -PFN_ORDER(pg) = 0;
> > -pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
> > -
> > -for ( i = 0; i < PTE_NUM; i++ )
> > -{
> > -pte = &pt_vaddr[i];
> > -if ( !dma_pte_present(*pte) )
> > -continue;
> > -
> > -if ( next_level >= 1 )
> > -iommu_free_pagetable(dma_pte_addr(*pte), next_level);
> > -
> > -dma_clear_pte(*pte);
> > -iommu_sync_cache(pte, sizeof(struct dma_pte));
> 
> I didn't see sync_cache in the new iommu_free_pgtables. Is it intended
> (i.e. original flush is meaningless) or overlooked?
> 

The original v1 combined patch had the comment:

NOTE: There is no need to clear and sync PTEs during teardown since the per-
  device root entries will have already been cleared (when devices were
  de-assigned) so the page tables can no longer be accessed by the IOMMU.

I should have included that note in this one. I'll fix in v5.

  Paul

> Thanks
> Kevin



RE: [PATCH v4 06/14] iommu: flush I/O TLB if iommu_map() or iommu_unmap() fail

2020-08-14 Thread Durrant, Paul
> -Original Message-
> From: Tian, Kevin 
> Sent: 14 August 2020 07:53
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Durrant, Paul ; Jan Beulich 
> Subject: RE: [EXTERNAL] [PATCH v4 06/14] iommu: flush I/O TLB if iommu_map() 
> or iommu_unmap() fail
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> > From: Paul Durrant
> > Sent: Tuesday, August 4, 2020 9:42 PM
> >
> > From: Paul Durrant 
> >
> > This patch adds a full I/O TLB flush to the error paths of iommu_map() and
> > iommu_unmap().
> >
> > Without this change callers need constructs such as:
> >
> > rc = iommu_map/unmap(...)
> > err = iommu_flush(...)
> > if ( !rc )
> >   rc = err;
> >
> > With this change, it can be simplified to:
> >
> > rc = iommu_map/unmap(...)
> > if ( !rc )
> >   rc = iommu_flush(...)
> >
> > because, if the map or unmap fails the flush will be unnecessary. This saves
> 
> this statement is different from change in iommu_map...
> 
> > a stack variable and generally makes the call sites tidier.
> >
> > Signed-off-by: Paul Durrant 
> > ---
> > Cc: Jan Beulich 
> >
> > v2:
> >  - New in v2
> > ---
> >  xen/drivers/passthrough/iommu.c | 28 
> >  1 file changed, 12 insertions(+), 16 deletions(-)
> >
> > diff --git a/xen/drivers/passthrough/iommu.c
> > b/xen/drivers/passthrough/iommu.c
> > index 660dc5deb2..e2c0193a09 100644
> > --- a/xen/drivers/passthrough/iommu.c
> > +++ b/xen/drivers/passthrough/iommu.c
> > @@ -274,6 +274,10 @@ int iommu_map(struct domain *d, dfn_t dfn, mfn_t
> > mfn,
> >  break;
> >  }
> >
> > +/* Something went wrong so flush everything and clear flush flags */
> > +if ( unlikely(rc) && iommu_iotlb_flush_all(d, *flush_flags) )
> > +flush_flags = 0;
> > +
> 
> ... earlier you said flush is unnecessary if map fails. But here actually you
> still need to flush everything so it's just sort of moving error-path flush
> within the map function?

Yes, that's actually what's happening. The language in the comment is ambiguous 
I guess. I'll modify it to say

"because, if the map or unmap fails an explicit flush will be unnecessary."

Hopefully that is clearer.

  Paul

> 
> Thanks
> Kevin
> 
> >  return rc;
> >  }
> >
> > @@ -283,14 +287,8 @@ int iommu_legacy_map(struct domain *d, dfn_t dfn,
> > mfn_t mfn,
> >  unsigned int flush_flags = 0;
> >  int rc = iommu_map(d, dfn, mfn, page_order, flags, &flush_flags);
> >
> > -if ( !this_cpu(iommu_dont_flush_iotlb) )
> > -{
> > -int err = iommu_iotlb_flush(d, dfn, (1u << page_order),
> > -flush_flags);
> > -
> > -if ( !rc )
> > -rc = err;
> > -}
> > +if ( !this_cpu(iommu_dont_flush_iotlb) && !rc )
> > +rc = iommu_iotlb_flush(d, dfn, (1u << page_order), flush_flags);
> >
> >  return rc;
> >  }
> > @@ -330,6 +328,10 @@ int iommu_unmap(struct domain *d, dfn_t dfn,
> > unsigned int page_order,
> >  }
> >  }
> >
> > +/* Something went wrong so flush everything and clear flush flags */
> > +if ( unlikely(rc) && iommu_iotlb_flush_all(d, *flush_flags) )
> > +flush_flags = 0;
> > +
> >  return rc;
> >  }
> >
> > @@ -338,14 +340,8 @@ int iommu_legacy_unmap(struct domain *d, dfn_t
> > dfn, unsigned int page_order)
> >  unsigned int flush_flags = 0;
> >  int rc = iommu_unmap(d, dfn, page_order, &flush_flags);
> >
> > -if ( !this_cpu(iommu_dont_flush_iotlb) )
> > -{
> > -int err = iommu_iotlb_flush(d, dfn, (1u << page_order),
> > -flush_flags);
> > -
> > -if ( !rc )
> > -rc = err;
> > -}
> > +if ( !this_cpu(iommu_dont_flush_iotlb) && ! rc )
> > +rc = iommu_iotlb_flush(d, dfn, (1u << page_order), flush_flags);
> >
> >  return rc;
> >  }
> > --
> > 2.20.1
> >




RE: [ANNOUNCE] Call for agenda items for 3 September Community Call @ 15:00 UTC

2020-08-28 Thread Durrant, Paul
> -Original Message-
> From: George Dunlap 
> Sent: 28 August 2020 11:48
> To: Tamas K Lengyel ; intel-...@intel.com; 
> daniel.ki...@oracle.com; Roger
> Pau Monne ; Sergey Dyasli ; 
> Christopher Clark
> ; Rich Persaud ; Kevin 
> Pearson
> ; Juergen Gross ; 
> Durrant, Paul
> ; Ji, John ; 
> edgar.igles...@xilinx.com;
> robin.randh...@arm.com; Artem Mygaiev ; Matt Spencer 
> ;
> anastassios.na...@onapp.com; Stewart Hildebrand 
> ; Volodymyr
> Babchuk ; mirela.simono...@aggios.com; Jarvis 
> Roach
> ; Jeff Kubascik 
> ; Stefano Stabellini
> ; Julien Grall ; Ian Jackson 
> ; Rian
> Quinn ; Daniel P. Smith ; 
> ​​​Doug Goldstein
> ; George Dunlap ; Woodhouse, 
> David ; ​​
> Amit Shah ; ​​​Varad Gautam ; 
> Brian Woods
> ; Robert Townley ; Bobby 
> Eshleman
> ; Olivier Lambert ; 
> Andrew Cooper
> ; Wei Liu 
> Cc: open list:X86 
> Subject: [EXTERNAL] [ANNOUNCE] Call for agenda items for 3 September 
> Community Call @ 15:00 UTC
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> Hi all,
> 
> The proposed agenda is in 
> https://cryptpad.fr/pad/#/3/pad/edit/f147b0aed8fe26af721ca3cd00425361/ and
> you can edit to add items.  Alternatively, you can reply to this mail 
> directly.

George,

I don't seem to be able to access the doc via that link. Do you need to give 
permissions?

  Paul

> 
> Agenda items appreciated a few days before the call: please put your name 
> besides items if you edit
> the document.
> 
> Note the following administrative conventions for the call:
> * Unless agreed otherwise in the previous meeting, the call is on the 1st 
> Thursday of each month at
> 1600 British Time (either GMT or BST)
> * I usually send out a meeting reminder a few days before with a provisional 
> agenda
> 
> * If you want to be CC'ed please add or remove yourself from the 
> sign-up-sheet at
> https://cryptpad.fr/pad/#/2/pad/edit/D9vGzihPxxAOe6RFPz0sRCf+/
> 
> Best Regards
> George
> 
> 
> == Dial-in Information ==
> ## Meeting time
> 15:00 - 16:00 UTC (during BST)
> Further International meeting times:
> https://www.timeanddate.com/worldclock/meetingdetails.html?year=2020&month=5&day=7&hour=15&min=0&sec=0
> &p1=1234&p2=37&p3=224&p4=179
> 
> 
> ## Dial in details
> Web: https://www.gotomeet.me/GeorgeDunlap
> 
> You can also dial in using your phone.
> Access Code: 168-682-109
> 
> China (Toll Free): 4008 811084
> Germany: +49 692 5736 7317
> Poland (Toll Free): 00 800 1124759
> Ukraine (Toll Free): 0 800 50 1733
> United Kingdom: +44 330 221 0088
> United States: +1 (571) 317-3129
> Spain: +34 932 75 2004
> 
> 
> More phone numbers
> Australia: +61 2 9087 3604
> Austria: +43 7 2081 5427
> Argentina (Toll Free): 0 800 444 3375
> Bahrain (Toll Free): 800 81 111
> Belarus (Toll Free): 8 820 0011 0400
> Belgium: +32 28 93 7018
> Brazil (Toll Free): 0 800 047 4906
> Bulgaria (Toll Free): 00800 120 4417
> Canada: +1 (647) 497-9391
> Chile (Toll Free): 800 395 150
> Colombia (Toll Free): 01 800 518 4483
> Czech Republic (Toll Free): 800 500448
> Denmark: +45 32 72 03 82
> Finland: +358 923 17 0568
> France: +33 170 950 594
> Greece (Toll Free): 00 800 4414 3838
> Hong Kong (Toll Free): 30713169906-886-965
> Hungary (Toll Free): (06) 80 986 255
> Iceland (Toll Free): 800 7204
> India (Toll Free): 18002669272
> Indonesia (Toll Free): 007 803 020 5375
> Ireland: +353 15 360 728
> Israel (Toll Free): 1 809 454 830
> Italy: +39 0 247 92 13 01
> Japan (Toll Free): 0 120 663 800
> Korea, Republic of (Toll Free): 00798 14 207 4914
> Luxembourg (Toll Free): 800 85158
> Malaysia (Toll Free): 1 800 81 6854
> Mexico (Toll Free): 01 800 522 1133
> Netherlands: +31 207 941 377
> New Zealand: +64 9 280 6302
> Norway: +47 21 93 37 51
> Panama (Toll Free): 00 800 226 7928
> Peru (Toll Free): 0 800 77023
> Philippines (Toll Free): 1 800 1110 1661
> Portugal (Toll Free): 800 819 575
> Romania (Toll Free): 0 800 410 029
> Russian Federation (Toll Free): 8 800 100 6203
> Saudi Arabia (Toll Free): 800 844 3633
> Singapore (Toll Free): 18007231323
> South Africa (Toll Free): 0 800 555 447
> Sweden: +46 853 527 827
> Switzerland: +41 225 4599 78
> Taiwan (Toll Free): 0 800 666 854
> Thailand (Toll Free): 001 800 011 023
> Turkey (Toll Free): 00 800 4488 23683
> United Arab Emirates (Toll Free): 800 044 40439
> Uruguay (Toll Free): 0004 019 1018
> Viet Nam (Toll Free): 122 80 481
> ​​​
> 
> First GoToMeeting? Let's do a quick system check:
> 
> https://link.gotomeeting.com/system-check


RE: [PATCH v3 0/8] tools: propagate MTU to vif frontends

2020-08-28 Thread Durrant, Paul
> -Original Message-
> From: Wei Liu 
> Sent: 27 August 2020 10:58
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Andrew Cooper
> ; Anthony PERARD ; 
> George Dunlap
> ; Ian Jackson ; Jan 
> Beulich ;
> Julien Grall ; Stefano Stabellini ; 
> Wei Liu 
> Subject: RE: [EXTERNAL] [PATCH v3 0/8] tools: propagate MTU to vif frontends
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On Tue, Aug 11, 2020 at 09:01:54AM +0100, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > This is an expansion from v2 of the series to include the facility to set
> > the MTU in the vif config.
> >
> > There is also one cleanup patch to remove the defunct 'vif2' script.
> >
> > Paul Durrant (8):
> >   public/io/netif: specify MTU override node
> >   tools/hotplug/Linux: re-factor add_to_bridge() in
> > xen-network-common.sh
> >   tools/hotplug/Linux: add remove_from_bridge()
> >   tools/hotplug/Linux: remove code duplication in vif-bridge
> >   libxl: wire the libxl_device_nic 'mtu' value into xenstore
> >   tools/hotplug/Linux: modify set_mtu() to optionally use a configured
> > value...
> >   xl: add 'mtu' option to network configuration
> >   remove netchannel2 hotplug script... ancient history
> >
> 
> Patches 2 - 8:
> Acked-by: Wei Liu 
> 
> (I already acked patch 1)

Thanks Wei.

Ian... ping?

  Paul




RE: [xen-unstable test] 153154: regressions - trouble: broken/fail/pass

2020-09-01 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 31 August 2020 15:26
> To: Roger Pau Monné 
> Cc: Jürgen Groß ; osstest service owner 
> ; xen-
> de...@lists.xenproject.org; Durrant, Paul 
> Subject: RE: [EXTERNAL] [xen-unstable test] 153154: regressions - trouble: 
> broken/fail/pass
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 31.08.2020 16:23, Roger Pau Monné wrote:
> > On Mon, Aug 31, 2020 at 06:40:56AM +0200, Jürgen Groß wrote:
> >> On 30.08.20 18:11, osstest service owner wrote:
> >>> flight 153154 xen-unstable real [real]
> >>> http://logs.test-lab.xenproject.org/osstest/logs/153154/
> >>>
> >>> Regressions :-(
> >>>
> >>> Tests which did not succeed and are blocking,
> >>> including tests which could not be run:
> >>>   test-amd64-amd64-xl-qemut-debianhvm-i386-xsm
> >>> broken
> >>>   test-amd64-amd64-xl-qemut-debianhvm-amd64   
> >>> broken
> >>>   test-amd64-i386-xl-qemut-win7-amd64 
> >>> broken
> >>>   test-amd64-i386-xl-qemut-stubdom-debianhvm-amd64-xsm 10 
> >>> debian-hvm-install fail REGR. vs. 152877
> >>>   test-amd64-amd64-xl-qemut-stubdom-debianhvm-amd64-xsm 10 
> >>> debian-hvm-install fail REGR. vs.
> 152877
> >>
> >> Paul, I suspect some fallout from your hotplug/mtu series?
> >>
> >> The failure in
> >>
> >> http://logs.test-lab.xenproject.org/osstest/logs/153154/test-amd64-amd64-xl-qemut-stubdom-
> debianhvm-amd64-xsm/10.ts-debian-hvm-install.log
> >>
> >> is pointing in this direction IMO.
> >
> > There's a stubdom panic at:
> >
> > http://logs.test-lab.xenproject.org/osstest/logs/153154/test-amd64-amd64-xl-qemut-stubdom-debianhvm-
> amd64-xsm/godello0---var-log-xen-qemu-dm-debianhvm.guest.osstest.log
> >
> > No idea if it's related to the MTU stuff or not. Seems to happen during
> > netfront initialization, so might be related.
> 
> Might also be the netfront_init() changes in / for mini-os then.
> 

That sounds more likely. The only functional change that my series intended to 
make was to add a xenstore entry that AFAIK has only ever been used by Windows 
drivers. I'll go check the logs though.

  Paul

> Jan


RE: [PATCH v5 1/5] xen/common: introduce a new framework for save/restore of 'domain' context

2020-05-22 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 22 May 2020 15:25
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Julien Grall
> ; Andrew Cooper ; George Dunlap 
> ;
> Ian Jackson ; Stefano Stabellini 
> ; Wei Liu
> ; Volodymyr Babchuk ; Roger Pau 
> Monné 
> Subject: RE: [EXTERNAL] [PATCH v5 1/5] xen/common: introduce a new framework 
> for save/restore of
> 'domain' context
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 21.05.2020 18:19, Paul Durrant wrote:
> > To allow enlightened HVM guests (i.e. those that have PV drivers) to be
> > migrated without their co-operation it will be necessary to transfer 'PV'
> > state such as event channel state, grant entry state, etc.
> >
> > Currently there is a framework (entered via the hvm_save/load() functions)
> > that allows a domain's 'HVM' (architectural) state to be transferred but
> > 'PV' state is also common with pure PV guests and so this framework is not
> > really suitable.
> >
> > This patch adds the new public header and low level implementation of a new
> > common framework, entered via the domain_save/load() functions. Subsequent
> > patches will introduce other parts of the framework, and code that will
> > make use of it within the current version of the libxc migration stream.
> >
> > This patch also marks the HVM-only framework as deprecated in favour of the
> > new framework.
> >
> > Signed-off-by: Paul Durrant 
> > Acked-by: Julien Grall 
> 
> Reviewed-by: Jan Beulich 

Thanks.

> with one remark:
> 
> > +int domain_load_end(struct domain_context *c)
> > +{
> > +struct domain *d = c->domain;
> > +size_t len = c->desc.length - c->len;
> > +
> > +while ( c->len != c->desc.length ) /* unconsumed data or pad */
> > +{
> > +uint8_t pad;
> > +int rc = domain_load_data(c, &pad, sizeof(pad));
> > +
> > +if ( rc )
> > +return rc;
> > +
> > +if ( pad )
> > +return -EINVAL;
> > +}
> > +
> > +gdprintk(XENLOG_INFO, "%pd load: %s[%u] +%zu (-%zu)\n", d, c->name,
> > + c->desc.instance, c->len, len);
> 
> Unlike on the save side you assume c->name to be non-NULL here.
> We're not going to crash because of this, but it feels a little
> odd anyway, specifically with the function being non-static
> (albeit on the positive side we have the type being private to
> this file).

Yes, I didn't think it was worth the test since it is supposed to be a "can't 
happen" case. If we're worried about the load handler calling in with a bad 
value of c then we could add a magic number into the struct and ASSERT it is 
correct in domain_[save|load]_[begin|data|end].

  Paul

> 
> Jan


RE: [PATCH v5 4/5] common/domain: add a domain context record for shared_info...

2020-05-22 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 22 May 2020 15:34
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Ian Jackson
> ; Wei Liu ; Andrew Cooper 
> ; George
> Dunlap ; Julien Grall ; Stefano 
> Stabellini
> 
> Subject: RE: [EXTERNAL] [PATCH v5 4/5] common/domain: add a domain context 
> record for shared_info...
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 21.05.2020 18:19, Paul Durrant wrote:
> > @@ -1649,6 +1650,70 @@ int continue_hypercall_on_cpu(
> >  return 0;
> >  }
> >
> > +static int save_shared_info(const struct domain *d, struct domain_context 
> > *c,
> > +bool dry_run)
> > +{
> > +struct domain_shared_info_context ctxt = {
> > +#ifdef CONFIG_COMPAT
> > +.flags = has_32bit_shinfo(d) ? DOMAIN_SAVE_32BIT_SHINFO : 0,
> > +#endif
> > +.buffer_size = sizeof(shared_info_t),
> 
> But this size varies between native and compat.
> 
> > +static int load_shared_info(struct domain *d, struct domain_context *c)
> > +{
> > +struct domain_shared_info_context ctxt;
> > +size_t hdr_size = offsetof(typeof(ctxt), buffer);
> > +unsigned int i;
> > +int rc;
> > +
> > +rc = DOMAIN_LOAD_BEGIN(SHARED_INFO, c, &i);
> > +if ( rc )
> > +return rc;
> > +
> > +if ( i ) /* expect only a single instance */
> > +return -ENXIO;
> > +
> > +rc = domain_load_data(c, &ctxt, hdr_size);
> > +if ( rc )
> > +return rc;
> > +
> > +if ( ctxt.buffer_size != sizeof(shared_info_t) )
> > +return -EINVAL;
> 
> While on the save side things could be left as they are (yet
> I'd prefer a change), this should be flexible enough to allow
> at least the smaller compat size as well in the compat case.

Ok, I guess we can optimize the buffer size down if only the compat version is 
needed. Seems like slightly pointless complexity though.

> I wonder whether any smaller sizes might be acceptable, once
> again with the rest getting zero-padded.
> 

If the need arises to zero extend an older shared_info variant then that can be 
done in future.

> > +if ( ctxt.flags & DOMAIN_SAVE_32BIT_SHINFO )
> > +#ifdef CONFIG_COMPAT
> > +has_32bit_shinfo(d) = true;
> > +#else
> > +return -EINVAL;
> > +#endif
> 
> Am I mis-remembering or was a check lost of the remaining
> flags being zero? If I am, one needs adding in any case, imo.
> 

It wasn't flags before, but you're quite right; they should be zero-checked.

  Paul

> Jan


RE: [PATCH v6 4/5] common/domain: add a domain context record for shared_info...

2020-05-28 Thread Durrant, Paul
> -Original Message-
> From: Jan Beulich 
> Sent: 28 May 2020 10:42
> To: Paul Durrant 
> Cc: xen-devel@lists.xenproject.org; Durrant, Paul ; 
> Ian Jackson
> ; Wei Liu ; Andrew Cooper 
> ; George
> Dunlap ; Julien Grall ; Stefano 
> Stabellini
> 
> Subject: RE: [EXTERNAL] [PATCH v6 4/5] common/domain: add a domain context 
> record for shared_info...
> 
> CAUTION: This email originated from outside of the organization. Do not click 
> links or open
> attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On 27.05.2020 19:34, Paul Durrant wrote:
> > @@ -1649,6 +1650,75 @@ int continue_hypercall_on_cpu(
> >  return 0;
> >  }
> >
> > +static int save_shared_info(const struct domain *d, struct domain_context 
> > *c,
> > +bool dry_run)
> > +{
> > +struct domain_shared_info_context ctxt = {
> > +#ifdef CONFIG_COMPAT
> > +.flags = has_32bit_shinfo(d) ? DOMAIN_SAVE_32BIT_SHINFO : 0,
> > +.buffer_size = has_32bit_shinfo(d) ?
> > +   sizeof(struct compat_shared_info) :
> > +   sizeof(struct shared_info),
> > +#else
> > +.buffer_size = sizeof(struct shared_info),
> > +#endif
> 
> To prevent disconnect between the types used here and the actual
> pointer copied from, I'd have preferred
> 
> #ifdef CONFIG_COMPAT
> .flags = has_32bit_shinfo(d) ? DOMAIN_SAVE_32BIT_SHINFO : 0,
> .buffer_size = has_32bit_shinfo(d) ?
>sizeof(d->shared_info->compat) :
>sizeof(d->shared_info->native),
> #else
> .buffer_size = sizeof(*d->shared_info),
> #endif
> 
> But this is secondary, as the types indeed are very unlikely to go
> out of sync. What's more important is ...
> 
> > +static int load_shared_info(struct domain *d, struct domain_context *c)
> > +{
> > +struct domain_shared_info_context ctxt;
> > +size_t hdr_size = offsetof(typeof(ctxt), buffer);
> > +unsigned int i;
> > +int rc;
> > +
> > +rc = DOMAIN_LOAD_BEGIN(SHARED_INFO, c, &i);
> > +if ( rc )
> > +return rc;
> > +
> > +if ( i ) /* expect only a single instance */
> > +return -ENXIO;
> > +
> > +rc = domain_load_data(c, &ctxt, hdr_size);
> > +if ( rc )
> > +return rc;
> > +
> > +if ( ctxt.buffer_size > sizeof(shared_info_t) ||
> > + (ctxt.flags & ~DOMAIN_SAVE_32BIT_SHINFO) )
> > +return -EINVAL;
> > +
> > +if ( ctxt.flags & DOMAIN_SAVE_32BIT_SHINFO )
> > +#ifdef CONFIG_COMPAT
> > +has_32bit_shinfo(d) = true;
> > +#else
> > +return -EINVAL;
> > +#endif
> > +
> > +rc = domain_load_data(c, d->shared_info, sizeof(shared_info_t));
> > +if ( rc )
> > +return rc;
> 
> ... the still insufficient checking here. You shouldn't accept more
> than sizeof(d->shared_info->compat) worth of data in the compat case
> if you also don't accept more than sizeof(shared_info_t) in the
> native case. To save another round trip I'll offer to make the
> adjustments while committing, but patches 3 and 5 want Andrew's ack
> first anyway.

Ok, thanks.

  Paul

> 
> Jan


Re: [PATCH] IOMMU: make domctl handler tolerate NULL domain

2022-04-19 Thread Durrant, Paul

On 19/04/2022 10:39, Jan Beulich wrote:

Besides the reporter's issue of hitting a NULL deref when !CONFIG_GDBSX,
XEN_DOMCTL_test_assign_device can legitimately end up having NULL passed
here, when the domctl was passed DOMID_INVALID.

Fixes: 71e617a6b8f6 ("use is_iommu_enabled() where appropriate...")
Reported-by: Cheyenne Wills 
Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v2 1/2] PCI: replace stray uses of PCI_{DEVFN,BDF}2()

2022-04-21 Thread Durrant, Paul

On 21/04/2022 15:26, Jan Beulich wrote:

There's no good reason to use these when we already have a pci_sbdf_t
type object available. This extends to the use of PCI_BUS() in
pci_ecam_map_bus() as well.

No change to generated code (with gcc11 at least, and I have to admit
that I didn't expect compilers to necessarily be able to spot the
optimization potential on the original code).

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 
Reviewed-by: Kevin Tian 


Reviewed-by: Paul Durrant 



Re: [PATCH v2 2/2] PCI: replace "secondary" flavors of PCI_{DEVFN,BDF,SBDF}()

2022-04-21 Thread Durrant, Paul

On 21/04/2022 15:27, Jan Beulich wrote:

At their use sites the numeric suffixes are at least odd to read, first
and foremost for PCI_DEVFN2() where the suffix doesn't even match the
number of arguments. Make use of count_args() such that a single flavor
each suffices (leaving aside helper macros, which aren't supposed to be
used from the outside).

In parse_ppr_log_entry() take the opportunity and drop two local
variables and convert an assignment to an initializer.

In VT-d code fold a number of bus+devfn comparison pairs into a single
BDF comparison.

No change to generated code for the vast majority of the adjustments.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 
Reviewed-by: Kevin Tian 


Reviewed-by: Paul Durrant 



Re: Ping: [PATCH v2 3/3] AMD/IOMMU: consider hidden devices when flushing device I/O TLBs

2021-10-11 Thread Durrant, Paul

On 11/10/2021 09:04, Jan Beulich wrote:

On 17.09.2021 13:00, Jan Beulich wrote:

Hidden devices are associated with DomXEN but usable by the
hardware domain. Hence they need flushing as well when all devices are
to have flushes invoked.

While there drop a redundant ATS-enabled check and constify the first
parameter of the involved function.

Signed-off-by: Jan Beulich 


The VT-d side equivalent having gone in a while ago, I think it would
be good to have the AMD side on par.



Agreed.

Reviewed-by: Paul Durrant 



Re: [PATCH] x86/HVM: correct cleanup after failed viridian_vcpu_init()

2021-10-18 Thread Durrant, Paul

On 18/10/2021 11:42, Ian Jackson wrote:

Jan Beulich writes ("[PATCH] x86/HVM: correct cleanup after failed 
viridian_vcpu_init()"):

This happens after nestedhvm_vcpu_initialise(), so its effects also need
to be undone.

Fixes: 40a4a9d72d16 ("viridian: add init hooks")
Signed-off-by: Jan Beulich 

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1583,7 +1583,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
  
  rc = viridian_vcpu_init(v);

  if ( rc )
-goto fail5;
+goto fail6;


Not a comment about the patch; rather about the code in general.

I have not looked at the context.

But OMG, this is horrific.  How can anyone write code in such an idiom
without writing endless bugs ?



Fairly easily. I think this is the first one due to an incorrect exit label.
Using such an idiom in the Windows PV drivers had meant many issues 
could be easily debugged without further code modification because you 
get a fairly instant audit trail of the route out of any failure condition.


  Paul




Re: [PATCH 1/3] x86/Viridian: fix error code use

2021-11-18 Thread Durrant, Paul

On 18/11/2021 13:13, Jan Beulich wrote:

Both the wrong use of HV_STATUS_* and the return type of
hv_vpset_to_vpmask() can lead to viridian_hypercall()'s
ASSERT_UNREACHABLE() triggering when translating error codes from Xen
to Viridian representation.

Fixes: b4124682db6e ("viridian: add ExProcessorMasks variants of the flush 
hypercalls")
Fixes: 9afa867d42ba ("viridian: add ExProcessorMasks variant of the IPI 
hypercall")
Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH 2/3] x86/Viridian: drop dead variable updates

2021-11-18 Thread Durrant, Paul

On 18/11/2021 13:14, Jan Beulich wrote:

Both hvcall_flush_ex() and hvcall_ipi_ex() update "size" without
subsequently using the value; future compilers may warn about such.
Alongside dropping the updates, shrink the variables' scopes to
demonstrate that there are no outer scope uses.

Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH 3/3] x86/Viridian: fold duplicate vpset retrieval code

2021-11-18 Thread Durrant, Paul

On 18/11/2021 13:14, Jan Beulich wrote:

hvcall_{flush,ipi}_ex() use more almost identical code than what was
isolated into hv_vpset_to_vpmask(). Move that code there as well, to
have just one instance of it. This way all of HV_GENERIC_SET_SPARSE_4K
processing now happens in a single place.

Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH] x86/hvm: Remove callback from paging->flush_tlb() hook

2021-11-18 Thread Durrant, Paul

On 18/11/2021 11:19, Andrew Cooper wrote:

On 18/11/2021 10:45, Jan Beulich wrote:

On 17.11.2021 19:26, Andrew Cooper wrote:

TLB flushing is a hotpath, and function pointer calls are
expensive (especially under retpoline) for what amounts to an identity
transform on the data.  Just pass the vcpu_bitmap bitmap directly.

As we use NULL for all rather than none, introduce a flush_vcpu() helper to
avoid the risk of logical errors from opencoding the expression.  This also
means the viridian callers can avoid writing an all-ones bitmap for the
flushing logic to consume.

I think you want to clarify that you convert only one of the two ways of
specifying "all". The other (HV_GENERIC_SET_ALL as consumed by
hv_vpset_to_vpmask()) could also be converted, but this would be a bit
more involved. I have no idea which of the two Windows would typically
use, nor why there are two mechanisms in the first place.


Oh - I'd not spotted that path.  It is well hidden away from
HV_FLUSH_ALL_PROCESSORS.

Giving how windows APIs typically evolve,
HVCALL_FLUSH_VIRTUAL_ADDRESS_{SPACE,LIST} were first.  It has a limit
of 64 vCPUs, and the VpSet sparse form is clearly catering to massive
numbers of vCPUs.

I'd expect to see both paths used, so we ought to see about optimising
that too, in due course.



In my experience, yes, it only uses the sparse version if you have more 
than 64 vCPUs.


  Paul


No functional change.

Signed-off-by: Andrew Cooper 

Reviewed-by: Jan Beulich 


Thanks.

~Andrew






Re: [PATCH v2] x86/hvm: Remove callback from paging->flush_tlb() hook

2021-11-18 Thread Durrant, Paul

On 18/11/2021 18:48, Andrew Cooper wrote:

TLB flushing is a hotpath, and function pointer calls are
expensive (especially under retpoline) for what amounts to an identity
transform on the data.  Just pass the vcpu_bitmap bitmap directly.

As we use NULL for all rather than none, introduce a flush_vcpu() helper to
avoid the risk of logical errors from opencoding the expression.  This also
means the viridian callers can avoid writing an all-ones bitmap for the
flushing logic to consume.

No functional change.

Signed-off-by: Andrew Cooper 
Reviewed-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v2] xen-mapcache: Avoid entry->lock overflow

2022-01-25 Thread Durrant, Paul

On 24/01/2022 10:44, Ross Lagerwall wrote:

In some cases, a particular mapcache entry may be mapped 256 times
causing the lock field to wrap to 0. For example, this may happen when
using emulated NVME and the guest submits a large scatter-gather write.
At this point, the entry may be remapped, causing QEMU to write the wrong
data or crash (since remap is not atomic).

Avoid this overflow by increasing the lock field to a uint32_t and also
detect it and abort rather than continuing regardless.

Signed-off-by: Ross Lagerwall 


Reviewed-by: Paul Durrant 


---
Changes in v2: Change type to uint32_t since there is a hole there
anyway. The struct size remains at 48 bytes on x86_64.

  hw/i386/xen/xen-mapcache.c | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c
index bd47c3d672..f2ef977963 100644
--- a/hw/i386/xen/xen-mapcache.c
+++ b/hw/i386/xen/xen-mapcache.c
@@ -52,7 +52,7 @@ typedef struct MapCacheEntry {
  hwaddr paddr_index;
  uint8_t *vaddr_base;
  unsigned long *valid_mapping;
-uint8_t lock;
+uint32_t lock;
  #define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0)
  uint8_t flags;
  hwaddr size;
@@ -355,6 +355,12 @@ tryagain:
  if (lock) {
  MapCacheRev *reventry = g_malloc0(sizeof(MapCacheRev));
  entry->lock++;
+if (entry->lock == 0) {
+fprintf(stderr,
+"mapcache entry lock overflow: "TARGET_FMT_plx" -> %p\n",
+entry->paddr_index, entry->vaddr_base);
+abort();
+}
  reventry->dma = dma;
  reventry->vaddr_req = mapcache->last_entry->vaddr_base + 
address_offset;
  reventry->paddr_index = mapcache->last_entry->paddr_index;





Re: [PATCH] libxl: force netback to wait for hotplug execution before connecting

2022-01-25 Thread Durrant, Paul

On 24/01/2022 16:02, Roger Pau Monne wrote:

By writing an empty "hotplug-status" xenstore node in the backend path
libxl can force Linux netback to wait for hotplug script execution
before proceeding to the 'connected' state.

This is required so that netback doesn't skip state 2 (InitWait) and
thus blocks libxl waiting for such state in order to launch the
hotplug script (see libxl__wait_device_connection).

Reported-by: James Dingwall 
Signed-off-by: Roger Pau Monné 
Tested-by: James Dingwall 
---
Cc: Wei Liu 
Cc: Paul Durrant 


Reviewed-by: Paul Durrant 


---
  tools/libs/light/libxl_nic.c | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/tools/libs/light/libxl_nic.c b/tools/libs/light/libxl_nic.c
index 0b45469dca..0b9e70c9d1 100644
--- a/tools/libs/light/libxl_nic.c
+++ b/tools/libs/light/libxl_nic.c
@@ -248,6 +248,13 @@ static int libxl__set_xenstore_nic(libxl__gc *gc, uint32_t 
domid,
  flexarray_append(ro_front, "mtu");
  flexarray_append(ro_front, GCSPRINTF("%u", nic->mtu));
  
+/*

+ * Force backend to wait for hotplug script execution before switching to
+ * connected state.
+ */
+flexarray_append(back, "hotplug-status");
+flexarray_append(back, "");
+
  return 0;
  }
  





Re: [PATCH] x86emul: recognize CLDEMOTE

2022-01-25 Thread Durrant, Paul

On 25/01/2022 15:08, Jan Beulich wrote:

On 25.01.2022 15:22, Jan Beulich wrote:

We claim to support the insn, but so far the emulator has been handling
it as a NOP.

Signed-off-by: Jan Beulich 


I'm sorry, I should have Cc-ed Paul here as well.



Acked-by: Paul Durrant 


Jan


---
While handling x86emul_cldemote separately in hvmemul_cache_op() means
to carry some redundant code, folding it with CLFLUSH{,OPT} / CLWB
didn't seem very attractive either.

--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -23,6 +23,7 @@ $(call as-option-add,CFLAGS,CC,"xsaveopt
  $(call as-option-add,CFLAGS,CC,"rdseed %eax",-DHAVE_AS_RDSEED)
  $(call as-option-add,CFLAGS,CC,"clac",-DHAVE_AS_CLAC_STAC)
  $(call as-option-add,CFLAGS,CC,"clwb (%rax)",-DHAVE_AS_CLWB)
+$(call as-option-add,CFLAGS,CC,"cldemote (%rax)",-DHAVE_AS_CLDEMOTE)
  $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
  $(call as-option-add,CFLAGS,CC,"invpcid 
(%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
  $(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2351,6 +2351,28 @@ static int hvmemul_cache_op(
   * to be sensibly used is in (virtualization unaware) firmware.
   */
  break;
+
+case x86emul_cldemote:
+ASSERT(!is_x86_system_segment(seg));
+
+if ( !boot_cpu_has(X86_FEATURE_CLDEMOTE) ||
+ hvmemul_virtual_to_linear(seg, offset, 0, NULL, hvm_access_none,
+   hvmemul_ctxt, &addr) != X86EMUL_OKAY )
+break;
+
+if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+pfec |= PFEC_user_mode;
+
+mapping = hvmemul_map_linear_addr(addr, 0, pfec, hvmemul_ctxt);
+if ( mapping == ERR_PTR(~X86EMUL_EXCEPTION) )
+x86_emul_reset_event(&hvmemul_ctxt->ctxt);
+if ( IS_ERR_OR_NULL(mapping) )
+break;
+
+cldemote(mapping);
+
+hvmemul_unmap_linear_addr(mapping, addr, 0, hvmemul_ctxt);
+break;
  }
  
  return X86EMUL_OKAY;

--- a/xen/arch/x86/include/asm/system.h
+++ b/xen/arch/x86/include/asm/system.h
@@ -37,6 +37,16 @@ static inline void clwb(const void *p)
  #endif
  }
  
+static inline void cldemote(const void *p)

+{
+#if defined(HAVE_AS_CLDEMOTE)
+asm volatile ( "cldemote %0" :: "m" (*(const char *)p) );
+#else
+asm volatile ( ".byte 0x0f, 0x1c, 0x02"
+   :: "d" (p), "m" (*(const char *)p) );
+#endif
+}
+
  #define xchg(ptr,v) \
  ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr
  
--- a/xen/arch/x86/x86_emulate/x86_emulate.c

+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -6290,7 +6290,8 @@ x86_emulate(
  
  case X86EMUL_OPC(0x0f, 0x0d): /* GrpP (prefetch) */

  case X86EMUL_OPC(0x0f, 0x18): /* Grp16 (prefetch/nop) */
-case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
+case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1b): /* nop */
+case X86EMUL_OPC(0x0f, 0x1d) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
  break;
  
  #ifndef X86EMUL_NO_MMX

@@ -6627,6 +6628,12 @@ x86_emulate(
  
  #endif /* !X86EMUL_NO_SIMD */
  
+case X86EMUL_OPC(0x0f, 0x1c): /* cldemote / nop */

+if ( ctxt->cpuid->feat.cldemote && !vex.pfx && !modrm_reg &&
+ ops->cache_op )
+ops->cache_op(x86emul_cldemote, ea.mem.seg, ea.mem.off, ctxt);
+break;
+
  case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
  case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
  case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -177,6 +177,7 @@ enum x86_emulate_fpu_type {
  };
  
  enum x86emul_cache_op {

+x86emul_cldemote,
  x86emul_clflush,
  x86emul_clflushopt,
  x86emul_clwb,









Re: [PATCH] xen-hvm: Allow disabling buffer_io_timer

2022-01-26 Thread Durrant, Paul

On 26/01/2022 13:43, Jason Andryuk wrote:

On Tue, Dec 14, 2021 at 8:40 AM Durrant, Paul  wrote:


On 10/12/2021 11:34, Jason Andryuk wrote:

commit f37f29d31488 "xen: slightly simplify bufioreq handling" hard
coded setting req.count = 1 during initial field setup before the main
loop.  This missed a subtlety that an early exit from the loop when
there are no ioreqs to process, would have req.count == 0 for the return
value.  handle_buffered_io() would then remove state->buffered_io_timer.
Instead handle_buffered_iopage() is basically always returning true and
handle_buffered_io() always re-setting the timer.

Restore the disabling of the timer by introducing a new handled_ioreq
boolean and use as the return value.  The named variable will more
clearly show the intent of the code.

Signed-off-by: Jason Andryuk 


Reviewed-by: Paul Durrant 


Thanks, Paul.

What is the next step for getting this into QEMU?



Anthony, can you queue this?

  Paul


To re-state more plainly, this patch fixes a bug to let QEMU go idle
for longer stretches of time.  Without it, buffer_io_timer continues
to re-arm and fire every 100ms even if there is nothing to do.

Regards,
Jason





Re: [PATCH v2 1/4] IOMMU/x86: switch to alternatives-call patching in further instances

2022-01-28 Thread Durrant, Paul

On 27/01/2022 14:47, Jan Beulich wrote:

This is, once again, to limit the number of indirect calls as much as
possible. The only hook invocation which isn't sensible to convert is
setup(). And of course Arm-only use sites are left alone as well.

Note regarding the introduction / use of local variables in pci.c:
struct pci_dev's involved fields are const. This const propagates, via
typeof(), to the local helper variables in the altcall macros. These
helper variables are, however, used as outputs (and hence can't be
const). In iommu_get_device_group() make use of the new local variables
to also simplify some adjacent code.

Signed-off-by: Jan Beulich 

--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -198,7 +198,7 @@ int iommu_domain_init(struct domain *d,
  return ret;
  
  hd->platform_ops = iommu_get_ops();

-ret = hd->platform_ops->init(d);
+ret = iommu_call(hd->platform_ops, init, d);
  if ( ret || is_system_domain(d) )
  return ret;
  
@@ -233,7 +233,7 @@ void __hwdom_init iommu_hwdom_init(struc
  
  register_keyhandler('o', &iommu_dump_page_tables, "dump iommu page tables", 0);
  
-hd->platform_ops->hwdom_init(d);

+iommu_vcall(hd->platform_ops, hwdom_init, d);
  }
  
  static void iommu_teardown(struct domain *d)

@@ -576,7 +576,7 @@ int iommu_get_reserved_device_memory(iom
  if ( !ops->get_reserved_device_memory )
  return 0;
  
-return ops->get_reserved_device_memory(func, ctxt);

+return iommu_call(ops, get_reserved_device_memory, func, ctxt);
  }
  
  bool_t iommu_has_feature(struct domain *d, enum iommu_feature feature)

@@ -603,7 +603,7 @@ static void iommu_dump_page_tables(unsig
  continue;
  }
  
-dom_iommu(d)->platform_ops->dump_page_tables(d);

+iommu_vcall(dom_iommu(d)->platform_ops, dump_page_tables, d);
  }
  
  rcu_read_unlock(&domlist_read_lock);

--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -861,15 +861,15 @@ static int deassign_device(struct domain
  devfn += pdev->phantom_stride;
  if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
  break;
-ret = hd->platform_ops->reassign_device(d, target, devfn,
-pci_to_dev(pdev));
+ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn,
+ pci_to_dev(pdev));
  if ( ret )
  goto out;
  }
  
  devfn = pdev->devfn;

-ret = hd->platform_ops->reassign_device(d, target, devfn,
-pci_to_dev(pdev));
+ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn,
+ pci_to_dev(pdev));
  if ( ret )
  goto out;
  
@@ -1300,7 +1300,7 @@ static int iommu_add_device(struct pci_d

  {
  const struct domain_iommu *hd;
  int rc;
-u8 devfn;
+unsigned int devfn = pdev->devfn;
  
  if ( !pdev->domain )

  return -EINVAL;
@@ -1311,16 +1311,16 @@ static int iommu_add_device(struct pci_d
  if ( !is_iommu_enabled(pdev->domain) )
  return 0;
  
-rc = hd->platform_ops->add_device(pdev->devfn, pci_to_dev(pdev));

+rc = iommu_call(hd->platform_ops, add_device, devfn, pci_to_dev(pdev));
  if ( rc || !pdev->phantom_stride )
  return rc;
  
-for ( devfn = pdev->devfn ; ; )

+for ( ; ; )
  {
  devfn += pdev->phantom_stride;
  if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
  return 0;
-rc = hd->platform_ops->add_device(devfn, pci_to_dev(pdev));
+rc = iommu_call(hd->platform_ops, add_device, devfn, pci_to_dev(pdev));
  if ( rc )
  printk(XENLOG_WARNING "IOMMU: add %pp failed (%d)\n",
 &pdev->sbdf, rc);
@@ -1341,7 +1341,7 @@ static int iommu_enable_device(struct pc
   !hd->platform_ops->enable_device )
  return 0;
  
-return hd->platform_ops->enable_device(pci_to_dev(pdev));

+return iommu_call(hd->platform_ops, enable_device, pci_to_dev(pdev));
  }
  
  static int iommu_remove_device(struct pci_dev *pdev)

@@ -1363,7 +1363,8 @@ static int iommu_remove_device(struct pc
  devfn += pdev->phantom_stride;
  if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
  break;
-rc = hd->platform_ops->remove_device(devfn, pci_to_dev(pdev));
+rc = iommu_call(hd->platform_ops, remove_device, devfn,
+pci_to_dev(pdev));
  if ( !rc )
  continue;
  
@@ -1371,7 +1372,9 @@ static int iommu_remove_device(struct pc

  return rc;
  }
  
-return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));

+devfn = pdev->devfn;
+
+return iommu_call(hd->platform_ops, remove_device, devfn, 
pci_to_dev(pdev));
  }
  
  static int device_assigned(u16 seg, u8 bus, u8 devfn)

@@ -1421,7 +1424,8 @@ static int assign_devi

Re: [PATCH v2 2/4] VT-d / x86: re-arrange cache syncing

2022-01-28 Thread Durrant, Paul

On 27/01/2022 14:48, Jan Beulich wrote:

The actual function should always have lived in core x86 code; move it
there, replacing get_cache_line_size() by readily available (except very
early during boot; see the code comment) data. Also rename the function.

Drop the respective IOMMU hook, (re)introducing a respective boolean
instead. Replace a true and an almost open-coding instance of
iommu_sync_cache().

Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v2 3/4] VT-d: replace flush_all_cache()

2022-01-28 Thread Durrant, Paul

On 27/01/2022 14:49, Jan Beulich wrote:

Let's use infrastructure we have available instead of an open-coded
wbinvd() invocation.

Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v2 4/4] IOMMU/PCI: propagate get_device_group_id() failure

2022-01-28 Thread Durrant, Paul

On 27/01/2022 14:49, Jan Beulich wrote:

The VT-d hook can indicate an error, which shouldn't be ignored. Convert
the hook's return value to a proper error code, and let that bubble up.

Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: tools: propagate MTU to vif frontends (backporting)

2022-02-14 Thread Durrant, Paul

On 14/02/2022 10:48, James Dingwall wrote:

Hi,

I've been backporting this series to xen 4.14 and everything relating to the
backend seems to be working well.  For the frontend I can see the mtu value
published to xenstore but it doesn't appear to be consumed to set the matching
mtu in the guest.

https://lists.xenproject.org/archives/html/xen-devel/2020-08/msg00458.html

Is the expected solution a custom script running in the guest to make the
necessary change or have I missed something in how this is supposed to
operate?



It depends on your guest. Linux happily gets the MTU from DHCP, but 
Windows does not. Hence:


https://xenbits.xen.org/gitweb/?p=pvdrivers/win/xenvif.git;a=blob;f=src/xenvif/mac.c;;hb=HEAD#l440

Cheers,

  Paul




Re: [PATCH v2 RFC 4/5] x86/ioreq: report extended destination ID support by emulators

2022-02-16 Thread Durrant, Paul

On 16/02/2022 10:30, Roger Pau Monne wrote:

Introduce a new arch specific field to report whether an emulator
supports the Extended Destination ID field, so that the hypervisor can
refrain from exposing the feature if one of the emulators doesn't
support it.

Signed-off-by: Roger Pau Monné 
---
Changes since v1:
  - New in this version.
---
RFC: I find this kind of clumsy. In fact fully emulated devices
should already support Extended Destination ID without any
modifications, because XEN_DMOP_inject_msi gets passed the address and
data fields, so the hypervisor extracts the extended destination ID
from there.

PCI passthrough devices however use xc_domain_update_msi_irq and that
has leaked the gflags parameter in the API, even worse the position
of the flags are hardcoded in QEMU.

Should the clearing of ext_dest_id be limited to the domain using an
IOMMU?

RFC: Only enable ext_dest_id if max_cpu > 128? So the device model is
aware the domain must use ext_dest_id? (implies device model knows
APIC ID = CPU ID * 2)


There is still only a single sync ioreq server page, so 128 vCPUs is the 
max possible.


  Paul




Re: [PATCH v2 RFC 4/5] x86/ioreq: report extended destination ID support by emulators

2022-02-16 Thread Durrant, Paul

On 16/02/2022 11:32, Roger Pau Monné wrote:

On Wed, Feb 16, 2022 at 10:53:58AM +, Durrant, Paul wrote:

On 16/02/2022 10:30, Roger Pau Monne wrote:

Introduce a new arch specific field to report whether an emulator
supports the Extended Destination ID field, so that the hypervisor can
refrain from exposing the feature if one of the emulators doesn't
support it.

Signed-off-by: Roger Pau Monné 
---
Changes since v1:
   - New in this version.
---
RFC: I find this kind of clumsy. In fact fully emulated devices
should already support Extended Destination ID without any
modifications, because XEN_DMOP_inject_msi gets passed the address and
data fields, so the hypervisor extracts the extended destination ID
from there.

PCI passthrough devices however use xc_domain_update_msi_irq and that
has leaked the gflags parameter in the API, even worse the position
of the flags are hardcoded in QEMU.

Should the clearing of ext_dest_id be limited to the domain using an
IOMMU?

RFC: Only enable ext_dest_id if max_cpu > 128? So the device model is
aware the domain must use ext_dest_id? (implies device model knows
APIC ID = CPU ID * 2)


There is still only a single sync ioreq server page, so 128 vCPUs is the max
possible.


Right - so device models wanting to support > 128 vCPUs will already
need to be modified, and hence we could assume that any HVM guests
with > 128 vCPUs is using a device model capable of handling extended
destination ID?



Yes, exactly.

  Paul





Re: [PATCH v2 1/2] Revert "xen-netback: remove 'hotplug-status' once it has served its purpose"

2022-02-23 Thread Durrant, Paul

On 22/02/2022 00:18, Marek Marczykowski-Górecki wrote:

This reverts commit 1f2565780e9b7218cf92c7630130e82dcc0fe9c2.

The 'hotplug-status' node should not be removed as long as the vif
device remains configured. Otherwise the xen-netback would wait for
re-running the network script even if it was already called (in case of
the frontend re-connecting). But also, it _should_ be removed when the
vif device is destroyed (for example when unbinding the driver) -
otherwise the hotplug script would not configure the device whenever it
re-appears.

Moving removal of the 'hotplug-status' node was a workaround for nothing
calling network script after xen-netback module is reloaded. But when
vif interface is re-created (on xen-netback unbind/bind for example),
the script should be called, regardless of who does that - currently
this case is not handled by the toolstack, and requires manual
script call. Keeping hotplug-status=connected to skip the call is wrong
and leads to not configured interface.

More discussion at
https://lore.kernel.org/xen-devel/afedd7cb-a291-e773-8b0d-4db9b291f...@ipxe.org/T/#u

Cc: sta...@vger.kernel.org
Signed-off-by: Marek Marczykowski-Górecki 


Reviewed-by: Paul Durrant 



Re: [PATCH v2 2/2] Revert "xen-netback: Check for hotplug-status existence before watching"

2022-02-23 Thread Durrant, Paul

On 22/02/2022 00:18, Marek Marczykowski-Górecki wrote:

This reverts commit 2afeec08ab5c86ae21952151f726bfe184f6b23d.

The reasoning in the commit was wrong - the code expected to setup the
watch even if 'hotplug-status' didn't exist. In fact, it relied on the
watch being fired the first time - to check if maybe 'hotplug-status' is
already set to 'connected'. Not registering a watch for non-existing
path (which is the case if hotplug script hasn't been executed yet),
made the backend not waiting for the hotplug script to execute. This in
turns, made the netfront think the interface is fully operational, while
in fact it was not (the vif interface on xen-netback side might not be
configured yet).

This was a workaround for 'hotplug-status' erroneously being removed.
But since that is reverted now, the workaround is not necessary either.

More discussion at
https://lore.kernel.org/xen-devel/afedd7cb-a291-e773-8b0d-4db9b291f...@ipxe.org/T/#u

Cc: sta...@vger.kernel.org
Signed-off-by: Marek Marczykowski-Górecki 


Reviewed-by: Paul Durrant 



Re: [PATCH 2/2] hw/i386/xen/xen-hvm: Inline xen_piix_pci_write_config_client() and remove it

2022-06-27 Thread Durrant, Paul

On 26/06/2022 10:46, Bernhard Beschow wrote:

xen_piix_pci_write_config_client() is implemented in the xen sub tree and
uses PIIX constants internally, thus creating a direct dependency on
PIIX. Now that xen_set_pci_link_route() is stubbable, the logic of
xen_piix_pci_write_config_client() can be moved to PIIX which resolves
the dependency.

Signed-off-by: Bernhard Beschow 


Reviewed-by: Paul Durrant 



Re: [PATCH 1/2] hw/i386/xen/xen-hvm: Allow for stubbing xen_set_pci_link_route()

2022-06-27 Thread Durrant, Paul

On 26/06/2022 10:46, Bernhard Beschow wrote:

The only user of xen_set_pci_link_route() is
xen_piix_pci_write_config_client() which implements PIIX-specific logic in
the xen namespace. This makes xen-hvm depend on PIIX which could be
avoided if xen_piix_pci_write_config_client() was implemented in PIIX. In
order to do this, xen_set_pci_link_route() needs to be stubbable which
this patch addresses.

Signed-off-by: Bernhard Beschow 


Reviewed-by: Paul Durrant 



Re: [PATCH] xen/netback: handle empty rx queue in xenvif_rx_next_skb()

2022-07-13 Thread Durrant, Paul

On 13/07/2022 09:48, Juergen Gross wrote:

xenvif_rx_next_skb() is expecting the rx queue not being empty, but
in case the loop in xenvif_rx_action() is doing multiple iterations,
the availability of another skb in the rx queue is not being checked.

This can lead to crashes:

[40072.537261] BUG: unable to handle kernel NULL pointer dereference at 
0080
[40072.537407] IP: xenvif_rx_skb+0x23/0x590 [xen_netback]
[40072.537534] PGD 0 P4D 0
[40072.537644] Oops:  [#1] SMP NOPTI
[40072.537749] CPU: 0 PID: 12505 Comm: v1-c40247-q2-gu Not tainted 
4.12.14-122.121-default #1 SLE12-SP5
[40072.537867] Hardware name: HP ProLiant DL580 Gen9/ProLiant DL580 Gen9, BIOS 
U17 11/23/2021
[40072.537999] task: 880433b38100 task.stack: c90043d4
[40072.538112] RIP: e030:xenvif_rx_skb+0x23/0x590 [xen_netback]
[40072.538217] RSP: e02b:c90043d43de0 EFLAGS: 00010246
[40072.538319] RAX:  RBX: c90043cd7cd0 RCX: 00f7
[40072.538430] RDX:  RSI: 0006 RDI: c90043d43df8
[40072.538531] RBP: 003f R08: 77ff8000 R09: 0008
[40072.538644] R10: 7ff0 R11: 08f6 R12: c90043ce2708
[40072.538745] R13:  R14: c90043d43ed0 R15: 88043ea748c0
[40072.538861] FS: () GS:88048460() 
knlGS:
[40072.538988] CS: e033 DS:  ES:  CR0: 80050033
[40072.539088] CR2: 0080 CR3: 000407ac8000 CR4: 00040660
[40072.539211] Call Trace:
[40072.539319] xenvif_rx_action+0x71/0x90 [xen_netback]
[40072.539429] xenvif_kthread_guest_rx+0x14a/0x29c [xen_netback]

Fix that by stopping the loop in case the rx queue becomes empty.

Signed-off-by: Juergen Gross 


Reviewed-by: Paul Durrant 




Re: [PATCH v7 01/14] iommu: add preemption support to iommu_{un,}map()

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:43, Jan Beulich wrote:

From: Roger Pau Monné 

The loop in iommu_{,un}map() can be arbitrary large, and as such it
needs to handle preemption.  Introduce a new flag that signals whether
the function should do preemption checks, returning the number of pages
that have been processed in case a need for preemption was actually
found.

Note that the cleanup done in iommu_map() can now be incomplete if
preemption has happened, and hence callers would need to take care of
unmapping the whole range (ie: ranges already mapped by previously
preempted calls).  So far none of the callers care about having those
ranges unmapped, so error handling in arch_iommu_hwdom_init() can be
kept as-is.

Note that iommu_legacy_{un,}map() are left without preemption handling:
callers of those interfaces aren't going to be modified to pass bigger
chunks, and hence the functions won't be modified as they are legacy and
uses should be replaced with iommu_{un,}map() instead if preemption is
required.

Signed-off-by: Roger Pau Monné 
Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 02/14] IOMMU/x86: perform PV Dom0 mappings in batches

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:44, Jan Beulich wrote:

For large page mappings to be easily usable (i.e. in particular without
un-shattering of smaller page mappings) and for mapping operations to
then also be more efficient, pass batches of Dom0 memory to iommu_map().
In dom0_construct_pv() and its helpers (covering strict mode) this
additionally requires establishing the type of those pages (albeit with
zero type references).

The earlier establishing of PGT_writable_page | PGT_validated requires
the existing places where this gets done (through get_page_and_type())
to be updated: For pages which actually have a mapping, the type
refcount needs to be 1.

There is actually a related bug that gets fixed here as a side effect:
Typically the last L1 table would get marked as such only after
get_page_and_type(..., PGT_writable_page). While this is fine as far as
refcounting goes, the page did remain mapped in the IOMMU in this case
(when "iommu=dom0-strict").

Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 03/14] IOMMU/x86: support freeing of pagetables

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:44, Jan Beulich wrote:

For vendor specific code to support superpages we need to be able to
deal with a superpage mapping replacing an intermediate page table (or
hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
needed to free individual page tables while a domain is still alive.
Since the freeing needs to be deferred until after a suitable IOTLB
flush was performed, released page tables get queued for processing by a
tasklet.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 04/14] IOMMU/x86: new command line option to suppress use of superpage mappings

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:45, Jan Beulich wrote:

Before actually enabling their use, provide a means to suppress it in
case of problems. Note that using the option can also affect the sharing
of page tables in the VT-d / EPT combination: If EPT would use large
page mappings but the option is in effect, page table sharing would be
suppressed (to properly fulfill the admin request).

Requested-by: Roger Pau Monné 
Signed-off-by: Jan Beulich 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 05/14] AMD/IOMMU: allow use of superpage mappings

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:45, Jan Beulich wrote:

No separate feature flags exist which would control availability of
these; the only restriction is HATS (establishing the maximum number of
page table levels in general), and even that has a lower bound of 4.
Thus we can unconditionally announce 2M and 1G mappings. (Via non-
default page sizes the implementation in principle permits arbitrary
size mappings, but these require multiple identical leaf PTEs to be
written, which isn't all that different from having to write multiple
consecutive PTEs with increasing frame numbers. IMO that's therefore
beneficial only on hardware where suitable TLBs exist; I'm unaware of
such hardware.)

Note that in principle 512G and 256T mappings could also be supported
right away, but the freeing of page tables (to be introduced in
subsequent patches) when replacing a sufficiently populated tree with a
single huge page would need suitable preemption, which will require
extra work.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 06/14] VT-d: allow use of superpage mappings

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:46, Jan Beulich wrote:

... depending on feature availability (and absence of quirks).

Also make the page table dumping function aware of superpages.

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 07/14] x86: introduce helper for recording degree of contiguity in page tables

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:47, Jan Beulich wrote:

This is a re-usable helper (kind of a template) which gets introduced
without users so that the individual subsequent patches introducing such
users can get committed independently of one another.

See the comment at the top of the new file. To demonstrate the effect,
if a page table had just 16 entries, this would be the set of markers
for a page table with fully contiguous mappings:

index  0 1 2 3 4 5 6 7 8 9 A B C D E F
marker 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0

"Contiguous" here means not only present entries with successively
increasing MFNs, each one suitably aligned for its slot, but also a
respective number of all non-present entries.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 08/14] IOMMU/x86: prefill newly allocate page tables

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:47, Jan Beulich wrote:

Page tables are used for two purposes after allocation: They either
start out all empty, or they are filled to replace a superpage.
Subsequently, to replace all empty or fully contiguous page tables,
contiguous sub-regions will be recorded within individual page tables.
Install the initial set of markers immediately after allocation. Make
sure to retain these markers when further populating a page table in
preparation for it to replace a superpage.

The markers are simply 4-bit fields holding the order value of
contiguous entries. To demonstrate this, if a page table had just 16
entries, this would be the initial (fully contiguous) set of markers:

index  0 1 2 3 4 5 6 7 8 9 A B C D E F
marker 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0

"Contiguous" here means not only present entries with successively
increasing MFNs, each one suitably aligned for its slot, and identical
attributes, but also a respective number of all non-present (zero except
for the markers) entries.

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 09/14] AMD/IOMMU: free all-empty page tables

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:47, Jan Beulich wrote:

When a page table ends up with no present entries left, it can be
replaced by a non-present entry at the next higher level. The page table
itself can then be scheduled for freeing.

Note that while its output isn't used there yet,
pt_update_contig_markers() right away needs to be called in all places
where entries get updated, not just the one where entries get cleared.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 10/14] VT-d: free all-empty page tables

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:48, Jan Beulich wrote:

When a page table ends up with no present entries left, it can be
replaced by a non-present entry at the next higher level. The page table
itself can then be scheduled for freeing.

Note that while its output isn't used there yet,
pt_update_contig_markers() right away needs to be called in all places
where entries get updated, not just the one where entries get cleared.

Note further that while pt_update_contig_markers() updates perhaps
several PTEs within the table, since these are changes to "avail" bits
only I do not think that cache flushing would be needed afterwards. Such
cache flushing (of entire pages, unless adding yet more logic to be more
selective) would be quite noticeable performance-wise (very prominent
during Dom0 boot).

Also note that cache sync-ing is likely more strict than necessary. This
is both to be on the safe side as well as to maintain the pattern of all
updates of (potentially) live tables being accompanied by a flush (if so
needed).

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 11/14] AMD/IOMMU: replace all-contiguous page tables by superpage mappings

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:48, Jan Beulich wrote:

When a page table ends up with all contiguous entries (including all
identical attributes), it can be replaced by a superpage entry at the
next higher level. The page table itself can then be scheduled for
freeing.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 12/14] VT-d: replace all-contiguous page tables by superpage mappings

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:48, Jan Beulich wrote:

When a page table ends up with all contiguous entries (including all
identical attributes), it can be replaced by a superpage entry at the
next higher level. The page table itself can then be scheduled for
freeing.

The adjustment to LEVEL_MASK is merely to avoid leaving a latent trap
for whenever we (and obviously hardware) start supporting 512G mappings.

Note that cache sync-ing is likely more strict than necessary. This is
both to be on the safe side as well as to maintain the pattern of all
updates of (potentially) live tables being accompanied by a flush (if so
needed).

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 13/14] IOMMU/x86: add perf counters for page table splitting / coalescing

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:49, Jan Beulich wrote:

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin tian 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



Re: [PATCH v7 14/14] VT-d: fold dma_pte_clear_one() into its only caller

2022-07-22 Thread Durrant, Paul

On 05/07/2022 13:49, Jan Beulich wrote:

This way intel_iommu_unmap_page() ends up quite a bit more similar to
intel_iommu_map_page().

No functional change intended.

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 


Reviewed-by: Paul Durrant 



RE: [PATCH] xen-netback: Check for hotplug-status existence before watching

2021-05-11 Thread Durrant, Paul
> -Original Message-
> From: Marek Marczykowski-Górecki 
> Sent: 10 May 2021 20:43
> To: Michael Brown ; p...@xen.org
> Cc: p...@xen.org; xen-devel@lists.xenproject.org; net...@vger.kernel.org; 
> wei@kernel.org; Durrant,
> Paul 
> Subject: RE: [EXTERNAL] [PATCH] xen-netback: Check for hotplug-status 
> existence before watching
> 
> On Mon, May 10, 2021 at 08:06:55PM +0100, Michael Brown wrote:
> > If you have a suggested patch, I'm happy to test that it doesn't reintroduce
> > the regression bug that was fixed by this commit.
> 
> Actually, I've just tested with a simple reloading xen-netfront module. It
> seems in this case, the hotplug script is not re-executed. In fact, I
> think it should not be re-executed at all, since the vif interface
> remains in place (it just gets NO-CARRIER flag).
> 
> This brings a question, why removing hotplug-status in the first place?
> The interface remains correctly configured by the hotplug script after
> all. From the commit message:
> 
> xen-netback: remove 'hotplug-status' once it has served its purpose
> 
> Removing the 'hotplug-status' node in netback_remove() is wrong; the 
> script
> may not have completed. Only remove the node once the watch has fired and
> has been unregistered.
> 
> I think the intention was to remove 'hotplug-status' node _later_ in
> case of quickly adding and removing the interface. Is that right, Paul?

The removal was done to allow unbind/bind to function correctly. IIRC before 
the original patch doing a bind would stall forever waiting for the hotplug 
status to change, which would never happen.

> In that case, letting hotplug_status_changed() remove the entry wont
> work, because the watch was unregistered few lines earlier in
> netback_remove(). And keeping the watch is not an option, because the
> whole backend_info struct is going to be free-ed already.
> 
> If my guess about the original reason for the change is right, I think
> it should be fixed at the hotplug script level - it should check if the
> device is still there before writing 'hotplug-status' node.
> I'm not sure if doing it race-free is possible from a shell script (I think it
> requires doing xenstore read _and_ write in a single transaction). But
> in the worst case, the aftermath of losing the race is leaving a stray
> 'hotplug-status' xenstore node - not ideal, but also less harmful than
> failing to bring up an interface. At this point, the toolstack could clean
> it up later, perhaps while setting up that interface again (if it gets
> re-connected)?
> 
> Anyway, perhaps the best thing to do now, is to revert both commits, and
> think of an alternative solution for the original issue? That of course
> assumes I guessed correctly why it was done in the first place...
> 

Simply reverting everything would likely break the ability to do unbind and 
bind (which is useful e.g to allow update the netback module whilst guests are 
still running) so I don't think that's an option.

  Paul


RE: [PATCH] xen-netback: Check for hotplug-status existence before watching

2021-05-11 Thread Durrant, Paul
> -Original Message-
> From: Marek Marczykowski-Górecki 
> Sent: 11 May 2021 11:45
> To: Durrant, Paul 
> Cc: Michael Brown ; p...@xen.org; 
> xen-devel@lists.xenproject.org;
> net...@vger.kernel.org; wei@kernel.org
> Subject: RE: [EXTERNAL] [PATCH] xen-netback: Check for hotplug-status 
> existence before watching
> 
> On Tue, May 11, 2021 at 12:40:54PM +0200, Marek Marczykowski-Górecki wrote:
> > On Tue, May 11, 2021 at 07:06:55AM +, Durrant, Paul wrote:
> > > > -Original Message-
> > > > From: Marek Marczykowski-Górecki 
> > > > Sent: 10 May 2021 20:43
> > > > To: Michael Brown ; p...@xen.org
> > > > Cc: p...@xen.org; xen-devel@lists.xenproject.org; 
> > > > net...@vger.kernel.org; wei@kernel.org;
> Durrant,
> > > > Paul 
> > > > Subject: RE: [EXTERNAL] [PATCH] xen-netback: Check for hotplug-status 
> > > > existence before watching
> > > >
> > > > On Mon, May 10, 2021 at 08:06:55PM +0100, Michael Brown wrote:
> > > > > If you have a suggested patch, I'm happy to test that it doesn't 
> > > > > reintroduce
> > > > > the regression bug that was fixed by this commit.
> > > >
> > > > Actually, I've just tested with a simple reloading xen-netfront module. 
> > > > It
> > > > seems in this case, the hotplug script is not re-executed. In fact, I
> > > > think it should not be re-executed at all, since the vif interface
> > > > remains in place (it just gets NO-CARRIER flag).
> > > >
> > > > This brings a question, why removing hotplug-status in the first place?
> > > > The interface remains correctly configured by the hotplug script after
> > > > all. From the commit message:
> > > >
> > > > xen-netback: remove 'hotplug-status' once it has served its purpose
> > > >
> > > > Removing the 'hotplug-status' node in netback_remove() is wrong; 
> > > > the script
> > > > may not have completed. Only remove the node once the watch has 
> > > > fired and
> > > > has been unregistered.
> > > >
> > > > I think the intention was to remove 'hotplug-status' node _later_ in
> > > > case of quickly adding and removing the interface. Is that right, Paul?
> > >
> > > The removal was done to allow unbind/bind to function correctly. IIRC 
> > > before the original patch
> doing a bind would stall forever waiting for the hotplug status to change, 
> which would never happen.
> >
> > Hmm, in that case maybe don't remove it at all in the backend, and let
> > it be cleaned up by the toolstack, when it removes other backend-related
> > nodes?
> 
> No, unbind/bind _does_ require hotplug script to be called again.
> 

Yes, sorry I was misremembering. My memory is hazy but there was definitely a 
problem at the time with leaving the node in place.

> When exactly vif interface appears in the system (starts to be available
> for the hotplug script)? Maybe remove 'hotplug-status' just before that
> point?
> 

I really can't remember any detail. Perhaps try reverting both patches then and 
check that the unbind/rmmod/modprobe/bind sequence still works (and the backend 
actually makes it into connected state).

  Paul



Re: Ping: [PATCH] x86/HVM: drop dead assignments from hvmemul_rep_{movs,stos}()

2023-08-03 Thread Durrant, Paul

On 03/08/2023 12:46, Jan Beulich wrote:

On 27.07.2023 20:41, Stefano Stabellini wrote:

On Thu, 27 Jul 2023, Jan Beulich wrote:

In the latter case the variable altogether is then unused and hence gets
dropped, eliminating a Misra Rule 5.3 violation. I'm afraid I mistakenly
introduced both assignments in 57a57465daaf ("x86/HVM: use available
linear->phys translations in REP MOVS/STOS handling"), likely as a
result of some re-work on the patch.

Signed-off-by: Jan Beulich 


Reviewed-by: Stefano Stabellini 


Paul - any chance of an ack?



Sure.

Reviewed-by: Paul Durrant 





Re: [PATCH] ns16550c: avoid crash in ns16550_endboot in PV shim mode

2023-10-20 Thread Durrant, Paul

On 20/10/2023 14:37, David Woodhouse wrote:
[snip]


[0] 
https://elixir.bootlin.com/linux/latest/source/drivers/tty/hvc/hvc_xen.c#L258


I'm not convinced I believe what the comment says there about evtchn 0
being theoretically valid. I don't believe zero is a valid evtchn#, is
it?


gfn 0 might be valid, but I'm also pretty sure evtchn 0 is not valid.

  Paul



Re: [PATCH v3 05/28] hw/xen: fix XenStore watch delivery to guest

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

When fire_watch_cb() found the response buffer empty, it would call
deliver_watch() to generate the XS_WATCH_EVENT message in the response
buffer and send an event channel notification to the guest… without
actually *copying* the response buffer into the ring. So there was
nothing for the guest to see. The pending response didn't actually get
processed into the ring until the guest next triggered some activity
from its side.

Add the missing call to put_rsp().

It might have been slightly nicer to call xen_xenstore_event() here,
which would *almost* have worked. Except for the fact that it calls
xen_be_evtchn_pending() to check that it really does have an event
pending (and clear the eventfd for next time). And under Xen it's
defined that setting that fd to O_NONBLOCK isn't guaranteed to work,
so the emu implementation follows suit.

This fixes Xen device hot-unplug.

Fixes: 0254c4d19df ("hw/xen: Add xenstore wire implementation and implementation 
stubs")
Signed-off-by: David Woodhouse 
---
  hw/i386/kvm/xen_xenstore.c | 8 +---
  1 file changed, 5 insertions(+), 3 deletions(-)



Reviewed-by: Paul Durrant 




Re: [PATCH v3 06/28] hw/xen: take iothread mutex in xen_evtchn_reset_op()

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

The xen_evtchn_soft_reset() function requires the iothread mutex, but is
also called for the EVTCHNOP_reset hypercall. Ensure the mutex is taken
in that case.

Fixes: a15b10978fe6 ("hw/xen: Implement EVTCHNOP_reset")
Signed-off-by: David Woodhouse 
---
  hw/i386/kvm/xen_evtchn.c | 1 +
  1 file changed, 1 insertion(+)



Reviewed-by: Paul Durrant 




Re: [PATCH v3 07/28] hw/xen: use correct default protocol for xen-block on x86

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

Even on x86_64 the default protocol is the x86-32 one if the guest doesn't
specifically ask for x86-64.

Fixes: b6af8926fb85 ("xen: add implementations of xen-block connect and disconnect 
functions...")
Signed-off-by: David Woodhouse 
---
  hw/block/xen-block.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)



Reviewed-by: Paul Durrant 




Re: [PATCH v3 08/28] i386/xen: Ignore VCPU_SSHOTTMR_future flag in set_singleshot_timer()

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

Upstream Xen now ignores this flag¹, since the only guest kernel ever to
use it was buggy.

¹ https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
  target/i386/kvm/xen-emu.c | 20 ++--
  1 file changed, 10 insertions(+), 10 deletions(-)



Reviewed-by: Paul Durrant 




Re: [PATCH v3 13/28] hw/xen: automatically assign device index to block devices

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

There's no need to force the user to assign a vdev. We can automatically
assign one, starting at xvda and searching until we find the first disk
name that's unused.

This means we can now allow '-drive if=xen,file=xxx' to work without an
explicit separate -driver argument, just like if=virtio.

Rip out the legacy handling from the xenpv machine, which was scribbling
over any disks configured by the toolstack, and didn't work with anything
but raw images.

Signed-off-by: David Woodhouse 
Acked-by: Kevin Wolf 
---
  blockdev.c  | 15 +---
  hw/block/xen-block.c| 38 +
  hw/xen/xen_devconfig.c  | 28 -
  hw/xenpv/xen_machine_pv.c   |  9 ---
  include/hw/xen/xen-legacy-backend.h |  1 -
  5 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index a01c62596b..5d9f2e5395 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -255,13 +255,13 @@ void drive_check_orphaned(void)
   * Ignore default drives, because we create certain default
   * drives unconditionally, then leave them unclaimed.  Not the
   * users fault.
- * Ignore IF_VIRTIO, because it gets desugared into -device,
- * so we can leave failing to -device.
+ * Ignore IF_VIRTIO or IF_XEN, because it gets desugared into
+ * -device, so we can leave failing to -device.
   * Ignore IF_NONE, because leaving unclaimed IF_NONE remains
   * available for device_add is a feature.
   */
  if (dinfo->is_default || dinfo->type == IF_VIRTIO
-|| dinfo->type == IF_NONE) {
+|| dinfo->type == IF_XEN || dinfo->type == IF_NONE) {
  continue;
  }
  if (!blk_get_attached_dev(blk)) {
@@ -977,6 +977,15 @@ DriveInfo *drive_new(QemuOpts *all_opts, 
BlockInterfaceType block_default_type,
  qemu_opt_set(devopts, "driver", "virtio-blk", &error_abort);
  qemu_opt_set(devopts, "drive", qdict_get_str(bs_opts, "id"),
   &error_abort);
+} else if (type == IF_XEN) {
+QemuOpts *devopts;
+devopts = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
+   &error_abort);
+qemu_opt_set(devopts, "driver",
+ (media == MEDIA_CDROM) ? "xen-cdrom" : "xen-disk",
+ &error_abort);
+qemu_opt_set(devopts, "drive", qdict_get_str(bs_opts, "id"),
+ &error_abort);
  }
  
  filename = qemu_opt_get(legacy_opts, "file");

diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 64470fc579..5011fe9430 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -27,6 +27,7 @@
  #include "sysemu/block-backend.h"
  #include "sysemu/iothread.h"
  #include "dataplane/xen-block.h"
+#include "hw/xen/interface/io/xs_wire.h"
  #include "trace.h"
  
  static char *xen_block_get_name(XenDevice *xendev, Error **errp)

@@ -34,6 +35,43 @@ static char *xen_block_get_name(XenDevice *xendev, Error 
**errp)
  XenBlockDevice *blockdev = XEN_BLOCK_DEVICE(xendev);
  XenBlockVdev *vdev = &blockdev->props.vdev;
  
+if (blockdev->props.vdev.type == XEN_BLOCK_VDEV_TYPE_INVALID) {

+XenBus *xenbus = XEN_BUS(qdev_get_parent_bus(DEVICE(xendev)));
+char fe_path[XENSTORE_ABS_PATH_MAX + 1];
+char *value;
+int disk = 0;
+unsigned long idx;
+
+/* Find an unoccupied device name */


I'm not sure this is going to work, is it? What happens if 'hda', 'sda',
or 'd0' already exists? I think you need to use the core of the code in
xen_block_set_vdev() to generate names and search all possible encodings
for each disk.


  Paul


+while (disk < (1 << 20)) {
+if (disk < (1 << 4)) {
+idx = (202 << 8) | (disk << 4);
+} else {
+idx = (1 << 28) | (disk << 8);
+}
+snprintf(fe_path, sizeof(fe_path),
+ "/local/domain/%u/device/vbd/%lu",
+ xendev->frontend_id, idx);
+value = qemu_xen_xs_read(xenbus->xsh, XBT_NULL, fe_path, NULL);
+if (!value) {
+if (errno == ENOENT) {
+vdev->type = XEN_BLOCK_VDEV_TYPE_XVD;
+vdev->partition = 0;
+vdev->disk = disk;
+vdev->number = idx;
+goto found;
+}
+error_setg(errp, "cannot read %s: %s", fe_path,
+   strerror(errno));
+return NULL;
+}
+free(value);
+disk++;
+}
+error_setg(errp, "cannot find device vdev for block device");
+return NULL;
+}
+ found:
  return g_strdup_printf("%lu", vdev->number);
  }
  
diff --git a/hw/xen/xen_devconfig.c b/hw/xen/xen_

Re: [PATCH v3 14/28] hw/xen: add get_frontend_path() method to XenDeviceClass

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

The primary Xen console is special. The guest's side is set up for it by
the toolstack automatically and not by the standard PV init sequence.

Accordingly, its *frontend* doesn't appear in …/device/console/0 either;
instead it appears under …/console in the guest's XenStore node.

To allow the Xen console driver to override the frontend path for the
primary console, add a method to the XenDeviceClass which can be used
instead of the standard xen_device_get_frontend_path()

Signed-off-by: David Woodhouse 
---
  hw/xen/xen-bus.c | 11 ++-
  include/hw/xen/xen-bus.h |  2 ++
  2 files changed, 12 insertions(+), 1 deletion(-)



Reviewed-by: Paul Durrant 




Re: [PATCH v3 17/28] hw/xen: add support for Xen primary console in emulated mode

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

The primary console is special because the toolstack maps a page into
the guest for its ring, and also allocates the guest-side event channel.
The guest's grant table is even primed to export that page using a known
grant ref#. Add support for all that in emulated mode, so that we can
have a primary console.

For reasons unclear, the backends running under real Xen don't just use
a mapping of the well-known GNTTAB_RESERVED_CONSOLE grant ref (which
would also be in the ring-ref node in XenStore). Instead, the toolstack
sets the ring-ref node of the primary console to the GFN of the guest
page. The backend is expected to handle that special case and map it
with foreignmem operations instead.

We don't have an implementation of foreignmem ops for emulated Xen mode,
so just make it map GNTTAB_RESERVED_CONSOLE instead. This would probably
work for real Xen too, but we can't work out how to make real Xen create
a primary console of type "ioemu" to make QEMU drive it, so we can't
test that; might as well leave it as it is for now under Xen.

Now at last we can boot the Xen PV shim and run PV kernels in QEMU.

Signed-off-by: David Woodhouse 
---
  hw/char/xen_console.c |  78 
  hw/i386/kvm/meson.build   |   1 +
  hw/i386/kvm/trace-events  |   2 +
  hw/i386/kvm/xen-stubs.c   |   8 ++
  hw/i386/kvm/xen_gnttab.c  |   7 +-
  hw/i386/kvm/xen_primary_console.c | 193 ++
  hw/i386/kvm/xen_primary_console.h |  23 
  hw/i386/kvm/xen_xenstore.c|  10 ++
  hw/xen/xen-bus.c  |   5 +
  include/hw/xen/xen-bus.h  |   1 +
  target/i386/kvm/xen-emu.c |  23 +++-
  11 files changed, 328 insertions(+), 23 deletions(-)
  create mode 100644 hw/i386/kvm/xen_primary_console.c
  create mode 100644 hw/i386/kvm/xen_primary_console.h



Reviewed-by: Paul Durrant 





Re: [PATCH v3 18/28] hw/xen: only remove peers of PCI NICs on unplug

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

When the Xen guest asks to unplug *emulated* NICs, it's kind of unhelpful
also to unplug the peer of the *Xen* PV NIC.

Signed-off-by: David Woodhouse 
---
  hw/i386/xen/xen_platform.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)



Reviewed-by: Paul Durrant 



Re: [PATCH v3 19/28] hw/xen: update Xen PV NIC to XenDevice model

2023-10-27 Thread Durrant, Paul

On 25/10/2023 15:50, David Woodhouse wrote:

From: David Woodhouse 

This allows us to use Xen PV networking with emulated Xen guests, and to
add them on the command line or hotplug.

Signed-off-by: David Woodhouse 
---
  hw/net/meson.build|   2 +-
  hw/net/trace-events   |  11 +
  hw/net/xen_nic.c  | 484 +-
  hw/xenpv/xen_machine_pv.c |   1 -
  4 files changed, 381 insertions(+), 117 deletions(-)

diff --git a/hw/net/meson.build b/hw/net/meson.build
index 2632634df3..f64651c467 100644
--- a/hw/net/meson.build
+++ b/hw/net/meson.build
@@ -1,5 +1,5 @@
  system_ss.add(when: 'CONFIG_DP8393X', if_true: files('dp8393x.c'))
-system_ss.add(when: 'CONFIG_XEN', if_true: files('xen_nic.c'))
+system_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen_nic.c'))
  system_ss.add(when: 'CONFIG_NE2000_COMMON', if_true: files('ne2000.c'))
  
  # PCI network cards

diff --git a/hw/net/trace-events b/hw/net/trace-events
index 3abfd65e5b..3097742cc0 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -482,3 +482,14 @@ dp8393x_receive_oversize(int size) "oversize packet, pkt_size 
is %d"
  dp8393x_receive_not_netcard(void) "packet not for netcard"
  dp8393x_receive_packet(int crba) "Receive packet at 0x%"PRIx32
  dp8393x_receive_write_status(int crba) "Write status at 0x%"PRIx32
+
+# xen_nic.c
+xen_netdev_realize(int dev, const char *info, const char *peer) "vif%u info '%s' 
peer '%s'"
+xen_netdev_unrealize(int dev) "vif%u"
+xen_netdev_create(int dev) "vif%u"
+xen_netdev_destroy(int dev) "vif%u"
+xen_netdev_disconnect(int dev) "vif%u"
+xen_netdev_connect(int dev, unsigned int tx, unsigned int rx, int port) "vif%u tx 
%u rx %u port %u"
+xen_netdev_frontend_changed(const char *dev, int state) "vif%s state %d"
+xen_netdev_tx(int dev, int ref, int off, int len, unsigned int flags, const char *c, 
const char *d, const char *m, const char *e) "vif%u ref %u off %u len %u flags 
0x%x%s%s%s%s"
+xen_netdev_rx(int dev, int idx, int status, int flags) "vif%u idx %d status %d 
flags 0x%x"
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 9bbf6599fc..af4ba3f1e6 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -20,6 +20,13 @@
   */
  
  #include "qemu/osdep.h"

+#include "qemu/main-loop.h"
+#include "qemu/cutils.h"
+#include "qemu/log.h"
+#include "qemu/qemu-print.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/error.h"
+
  #include 
  #include 
  #include 
@@ -27,18 +34,26 @@
  #include "net/net.h"
  #include "net/checksum.h"
  #include "net/util.h"
-#include "hw/xen/xen-legacy-backend.h"
+
+#include "hw/xen/xen-backend.h"
+#include "hw/xen/xen-bus-helper.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
  
  #include "hw/xen/interface/io/netif.h"

+#include "hw/xen/interface/io/xs_wire.h"
+
+#include "trace.h"
  
  /* - */
  
  struct XenNetDev {

-struct XenLegacyDevice  xendev;  /* must be first */
-char  *mac;
+struct XenDevice  xendev;  /* must be first */
+XenEventChannel   *event_channel;
+int   dev;
  int   tx_work;
-int   tx_ring_ref;
-int   rx_ring_ref;
+unsigned int  tx_ring_ref;
+unsigned int  rx_ring_ref;
  struct netif_tx_sring *txs;
  struct netif_rx_sring *rxs;
  netif_tx_back_ring_t  tx_ring;
@@ -47,6 +62,11 @@ struct XenNetDev {
  NICState  *nic;
  };
  
+typedef struct XenNetDev XenNetDev;

+
+#define TYPE_XEN_NET_DEVICE "xen-net-device"
+OBJECT_DECLARE_SIMPLE_TYPE(XenNetDev, XEN_NET_DEVICE)
+
  /* - */
  
  static void net_tx_response(struct XenNetDev *netdev, netif_tx_request_t *txp, int8_t st)

@@ -68,7 +88,8 @@ static void net_tx_response(struct XenNetDev *netdev, 
netif_tx_request_t *txp, i
  netdev->tx_ring.rsp_prod_pvt = ++i;
  RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netdev->tx_ring, notify);
  if (notify) {
-xen_pv_send_notify(&netdev->xendev);
+xen_device_notify_event_channel(XEN_DEVICE(netdev),
+netdev->event_channel, NULL);
  }
  
  if (i == netdev->tx_ring.req_cons) {

@@ -104,13 +125,16 @@ static void net_tx_error(struct XenNetDev *netdev, 
netif_tx_request_t *txp, RING
  #endif
  }
  
-static void net_tx_packets(struct XenNetDev *netdev)

+static bool net_tx_packets(struct XenNetDev *netdev)
  {
+bool done_something = false;
  netif_tx_request_t txreq;
  RING_IDX rc, rp;
  void *page;
  void *tmpbuf = NULL;
  
+assert(qemu_mutex_iothread_locked());

+
  for (;;) {
  rc = netdev->tx_ring.req_cons;
  rp = netdev->tx_ring.sring->req_prod;
@@ -122,49 +146,52 @@ static void net_tx_packets(struct XenNetDev *netdev)
  }
  memcpy(&txreq, RING_GET_REQUEST(&netdev->tx_ring, rc), 
sizeof(

<    1   2   3   4   5   >