[Xen-devel] [PATCH v13 2/3] x86/mem_sharing: reset a fork

2020-03-30 Thread Tamas K Lengyel
Implement hypercall that allows a fork to shed all memory that got allocated
for it during its execution and re-load its vCPU context from the parent VM.
This allows the forked VM to be reset into the same state the parent VM is in,
which is faster than creating a new fork would be. Measurements show about a 2x
speedup during normal fuzzing operations. Performance may vary depending on how
much memory got allocated for the forked VM. If it has been completely
deduplicated from the parent VM then creating a new fork would likely be more
performant.
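
For orientation, the intended usage pattern from a fuzzing harness looks
roughly like the sketch below, using the xc_memshr_fork and
xc_memshr_fork_reset libxc wrappers posted with the toolstack side of this
work later in this archive. This is a sketch only: error handling is trimmed,
the fork domain is assumed to already exist as an empty domain, and
run_one_fuzz_case() is a hypothetical harness hook, not part of the series.

#include <stdint.h>
#include <xenctrl.h>

extern void run_one_fuzz_case(uint32_t domid);  /* hypothetical hook */

static int fuzz_with_reset(uint32_t parent_domid, uint32_t fork_domid,
                           unsigned int iterations)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    int rc;

    if ( !xch )
        return -1;

    /* Fork once; fork_domid must name an already-created empty domain. */
    rc = xc_memshr_fork(xch, parent_domid, fork_domid, false);

    for ( unsigned int i = 0; !rc && i < iterations; i++ )
    {
        run_one_fuzz_case(fork_domid);
        /* Shed the memory the fork accumulated and reload its vCPU state. */
        rc = xc_memshr_fork_reset(xch, fork_domid);
    }

    xc_interface_close(xch);
    return rc;
}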

Signed-off-by: Tamas K Lengyel 
Reviewed-by: Roger Pau Monné 
---
v12: remove continuation & add comment back
 address style issues pointed out by Jan
---
 xen/arch/x86/mm/mem_sharing.c | 77 +++
 xen/include/public/memory.h   |  1 +
 2 files changed, 78 insertions(+)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index faa79011c3..cc09f9c84f 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1758,6 +1758,61 @@ static int fork(struct domain *cd, struct domain *d)
 return rc;
 }
 
+/*
+ * The fork reset operation is intended to be used on short-lived forks only.
+ * There is no hypercall continuation operation implemented for this reason.
+ * For forks that obtain a larger memory footprint it is likely going to be
+ * more performant to create a new fork instead of resetting an existing one.
+ *
+ * TODO: In case this hypercall becomes useful on forks with larger memory
+ * footprints the hypercall continuation should be implemented (or if this
+ * feature needs to become "stable").
+ */
+static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
+{
+    int rc;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    struct page_info *page, *tmp;
+
+    domain_pause(d);
+
+    /* need recursive lock because we will free pages */
+    spin_lock_recursive(&d->page_alloc_lock);
+    page_list_for_each_safe(page, tmp, &d->page_list)
+    {
+        p2m_type_t p2mt;
+        p2m_access_t p2ma;
+        mfn_t mfn = page_to_mfn(page);
+        gfn_t gfn = mfn_to_gfn(d, mfn);
+
+        mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
+                                    0, NULL, false);
+
+        /* only reset pages that are sharable */
+        if ( !p2m_is_sharable(p2mt) )
+            continue;
+
+        /* take an extra reference or just skip if can't for whatever reason */
+        if ( !get_page(page, d) )
+            continue;
+
+        /* forked memory is 4k, not splitting large pages so this must work */
+        rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
+                            p2m_invalid, p2m_access_rwx, -1);
+        ASSERT(!rc);
+
+        put_page_alloc_ref(page);
+        put_page(page);
+    }
+    spin_unlock_recursive(&d->page_alloc_lock);
+
+    rc = copy_settings(d, pd);
+
+    domain_unpause(d);
+
+    return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
 int rc;
@@ -2048,6 +2103,28 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         break;
     }
 
+    case XENMEM_sharing_op_fork_reset:
+    {
+        struct domain *pd;
+
+        rc = -EINVAL;
+        if ( mso.u.fork.pad[0] || mso.u.fork.pad[1] || mso.u.fork.pad[2] )
+            goto out;
+
+        rc = -ENOSYS;
+        if ( !d->parent )
+            goto out;
+
+        rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
+        if ( rc )
+            goto out;
+
+        rc = mem_sharing_fork_reset(d, pd);
+
+        rcu_unlock_domain(pd);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 5ee4e0da12..d36d64b8dc 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -483,6 +483,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_audit 7
 #define XENMEM_sharing_op_range_share   8
 #define XENMEM_sharing_op_fork  9
+#define XENMEM_sharing_op_fork_reset    10
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
-- 
2.20.1




[Xen-devel] [PATCH v13 1/3] xen/mem_sharing: VM forking

2020-03-30 Thread Tamas K Lengyel
VM forking is the process of creating a domain with an empty memory space and a
parent domain specified from which to populate the memory when necessary. For
the new domain to be functional the VM state is copied over as part of the fork
operation (HVM params, hap allocation, etc).
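
At the hypercall level the fork is a XENMEM_sharing_op memop issued against
the (still empty) fork domain, carrying the parent's domid. A sketch of the
invocation, mirroring what the toolstack-side xc_memshr_fork() wrapper later
in this archive boils down to; xc_memshr_memop is the libxc-internal memop
helper, and at this point in the series the fork sub-op only carries the
parent's domain id:

#include <string.h>
#include <stdint.h>
#include <xenctrl.h>

static int issue_fork(xc_interface *xch, uint32_t parent_domid,
                      uint32_t fork_domid)
{
    xen_mem_sharing_op_t mso;

    memset(&mso, 0, sizeof(mso));
    mso.op = XENMEM_sharing_op_fork;
    mso.u.fork.parent_domain = parent_domid;  /* IN: parent's domain id */

    /* Issued against the fork; Xen pauses the parent and copies VM state. */
    return xc_memshr_memop(xch, fork_domid, &mso);
}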

Signed-off-by: Tamas K Lengyel 
Acked-by: Jan Beulich 
---
v13: Address issues pointed out by Roger & Jan
 Introduce & use PAGE_OFFSET to calculate vcpu_info offset
---
 xen/arch/x86/domain.c |  13 ++
 xen/arch/x86/hvm/hvm.c|   4 +-
 xen/arch/x86/mm/hap/hap.c |   3 +-
 xen/arch/x86/mm/mem_sharing.c | 351 ++
 xen/arch/x86/mm/p2m.c |   9 +-
 xen/include/asm-arm/page.h|   1 +
 xen/include/asm-x86/hap.h |   1 +
 xen/include/asm-x86/hvm/hvm.h |   2 +
 xen/include/asm-x86/mem_sharing.h |  18 ++
 xen/include/asm-x86/page.h|   1 +
 xen/include/public/memory.h   |   5 +
 xen/include/xen/sched.h   |   1 +
 12 files changed, 404 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 683bc619aa..a008d7df1c 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2211,6 +2211,19 @@ int domain_relinquish_resources(struct domain *d)
             ret = relinquish_shared_pages(d);
             if ( ret )
                 return ret;
+
+            /*
+             * If the domain is forked, decrement the parent's pause count
+             * and release the domain.
+             */
+            if ( mem_sharing_is_fork(d) )
+            {
+                struct domain *parent = d->parent;
+
+                d->parent = NULL;
+                domain_unpause(parent);
+                put_domain(parent);
+            }
         }
 #endif
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index a3d115b650..304b3d1562 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
     }
 #endif
 
-    /* Spurious fault? PoD and log-dirty also take this path. */
+    /* Spurious fault? PoD, log-dirty and VM forking also take this path. */
     if ( p2m_is_ram(p2mt) )
     {
         rc = 1;
@@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
 return rc;
 }
 
-static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
+int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
 {
     int rc;
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index a6d5e39b02..814d0c3253 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
 }
 
 /* Return the size of the pool, rounded up to the nearest MB */
-static unsigned int
-hap_get_allocation(struct domain *d)
+unsigned int hap_get_allocation(struct domain *d)
 {
     unsigned int pg = d->arch.paging.hap.total_pages
         + d->arch.paging.hap.p2m_pages;
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index f49f27a3ef..faa79011c3 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -22,6 +22,7 @@
 
 #include <xen/types.h>
 #include <xen/domain_page.h>
+#include <xen/event.h>
 #include <xen/spinlock.h>
 #include <xen/rwlock.h>
 #include <xen/mm.h>
@@ -36,6 +37,8 @@
 #include <asm/altp2m.h>
 #include <asm/atomic.h>
 #include <asm/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/hvm.h>
 #include <xsm/xsm.h>
 
 #include "mm-locks.h"
@@ -1443,6 +1446,318 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
     return 0;
 }
 
+/*
+ * Forking a page only gets called when the VM faults due to no entry being
+ * in the EPT for the access. Depending on the type of access we either
+ * populate the physmap with a shared entry for read-only access or
+ * fork the page if it's a write access.
+ *
+ * The client p2m is already locked so we only need to lock
+ * the parent's here.
+ */
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
+{
+    int rc = -ENOENT;
+    shr_handle_t handle;
+    struct domain *parent = d->parent;
+    struct p2m_domain *p2m;
+    unsigned long gfn_l = gfn_x(gfn);
+    mfn_t mfn, new_mfn;
+    p2m_type_t p2mt;
+    struct page_info *page;
+
+    if ( !mem_sharing_is_fork(d) )
+        return -ENOENT;
+
+    if ( !unsharing )
+    {
+        /* For read-only accesses we just add a shared entry to the physmap */
+        while ( parent )
+        {
+            if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+                break;
+
+            parent = parent->parent;
+        }
+
+        if ( !rc )
+        {
+            /* The client's p2m is already locked */
+            p2m = p2m_get_hostp2m(parent);
+
+            p2m_lock(p2m);
+            rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
+            p2m_unlock(p2m);
+
+            if ( !rc )
+                return 0;
+        }
+    }
+
+    /*
+     * If it's a write access (ie. unsharing) or if adding a shared

Re: [Xen-devel] [PATCH v12 2/3] x86/mem_sharing: reset a fork

2020-03-26 Thread Tamas K Lengyel
On Thu, Mar 26, 2020 at 4:17 AM Jan Beulich  wrote:
>
> On 23.03.2020 18:04, Tamas K Lengyel wrote:
> > +static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
> > +{
> > +int rc;
> > +struct p2m_domain *p2m = p2m_get_hostp2m(d);
> > +struct page_info *page, *tmp;
> > +
> > +spin_lock(&d->page_alloc_lock);
> > +domain_pause(d);
>
> Why do you take the lock first?

No particular reason - does the order matter?

Tamas



Re: [Xen-devel] [PATCH v12 1/3] xen/mem_sharing: VM forking

2020-03-26 Thread Tamas K Lengyel
On Thu, Mar 26, 2020 at 6:33 AM Jan Beulich  wrote:
>
> On 23.03.2020 18:04, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
> >  ret = relinquish_shared_pages(d);
> >  if ( ret )
> >  return ret;
> > +
> > +/*
> > + * If the domain is forked, decrement the parent's pause count
> > + * and release the domain.
> > + */
> > +if ( mem_sharing_is_fork(d) )
> > +{
> > +domain_unpause(d->parent);
> > +put_domain(d->parent);
> > +d->parent = NULL;
>
> I think you want to clear the field before putting the reference,
> to make sure possible readers of it won't see it non-NULL when
> the domain is already being cleaned up, or even gone.

Sure.

>
> With this, applicable parts of the change
> Acked-by: Jan Beulich 
>
> I'll try to keep an eye on when you and Roger have settled on the
> remaining aspects, to determine when this (probably v13) can be
> committed.

Thanks!

>
> > --- a/xen/include/asm-x86/mem_sharing.h
> > +++ b/xen/include/asm-x86/mem_sharing.h
> > @@ -77,6 +77,14 @@ static inline int mem_sharing_unshare_page(struct domain 
> > *d,
> >  return rc;
> >  }
> >
> > +static inline bool mem_sharing_is_fork(struct domain *d)
>
> const? (then also in the stub further down)

Sure.

Tamas



Re: [Xen-devel] [PATCH v12 2/3] x86/mem_sharing: reset a fork

2020-03-26 Thread Tamas K Lengyel
On Thu, Mar 26, 2020 at 8:52 AM Jan Beulich  wrote:
>
> On 26.03.2020 15:48, Tamas K Lengyel wrote:
> > On Thu, Mar 26, 2020 at 4:17 AM Jan Beulich  wrote:
> >>
> >> On 23.03.2020 18:04, Tamas K Lengyel wrote:
> >>> +static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
> >>> +{
> >>> +int rc;
> >>> +struct p2m_domain *p2m = p2m_get_hostp2m(d);
> >>> +struct page_info *page, *tmp;
> >>> +
> >>> +spin_lock(&d->page_alloc_lock);
> >>> +domain_pause(d);
> >>
> >> Why do you take the lock first?
> >
> > No particular reason - does the order matter?
>
> I think you'd better avoid holding a lock for extended periods
> of time. And what's perhaps worse, what if a vCPU of the domain
> sits in Xen trying to acquire this lock - you'd deadlock trying
> to pause the domain then.

OK, I'll invert the order then.

Thanks,
Tamas



[PATCH v2] mem_sharing: map shared_info page to same gfn during fork

2020-04-28 Thread Tamas K Lengyel
During a VM fork we copy the shared_info page; however, we also need to ensure
that the page is mapped into the same GFN in the fork as it is in the parent.

Signed-off-by: Tamas K Lengyel 
Suggested-by: Roger Pau Monne 
---
 xen/arch/x86/mm/mem_sharing.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 344a5bfb3d..a1dea8fedb 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1656,6 +1656,7 @@ static void copy_tsc(struct domain *cd, struct domain *d)
 static int copy_special_pages(struct domain *cd, struct domain *d)
 {
     mfn_t new_mfn, old_mfn;
+    gfn_t new_gfn, old_gfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
     static const unsigned int params[] =
     {
@@ -1701,6 +1702,30 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
     new_mfn = _mfn(virt_to_mfn(cd->shared_info));
     copy_domain_page(new_mfn, old_mfn);
 
+    old_gfn = _gfn(get_gpfn_from_mfn(mfn_x(old_mfn)));
+    new_gfn = _gfn(get_gpfn_from_mfn(mfn_x(new_mfn)));
+
+    if ( !gfn_eq(old_gfn, new_gfn) )
+    {
+        if ( !gfn_eq(new_gfn, INVALID_GFN) )
+        {
+            /* if shared_info is mapped to a different gfn just remove it */
+            rc = p2m->set_entry(p2m, new_gfn, INVALID_MFN, PAGE_ORDER_4K,
+                                p2m_invalid, p2m->default_access, -1);
+            if ( rc )
+                return rc;
+        }
+
+        if ( !gfn_eq(old_gfn, INVALID_GFN) )
+        {
+            /* now map it to the same gfn as the parent */
+            rc = p2m->set_entry(p2m, old_gfn, new_mfn, PAGE_ORDER_4K,
+                                p2m_ram_rw, p2m->default_access, -1);
+            if ( rc )
+                return rc;
+        }
+    }
+
     return 0;
 }
 
-- 
2.20.1




Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset

2020-04-25 Thread Tamas K Lengyel
On Sat, Apr 25, 2020 at 3:01 AM Roger Pau Monné  wrote:
>
> On Fri, Apr 24, 2020 at 06:18:24AM -0600, Tamas K Lengyel wrote:
> > On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné wrote:
> > >
> > > On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > > > When resetting a VM fork we ought to only remove pages that were
> > > > allocated for the fork during its execution and the contents copied
> > > > over from the parent. This can be determined if the page is sharable
> > > > as special pages used by the fork for other purposes will not pass
> > > > this test. Unfortunately during the fork reset loop we only partially
> > > > check whether that's the case. A page's type may indicate it is
> > > > sharable (pass p2m_is_sharable) but that's not a sufficient check by
> > > > itself. All checks that are normally performed before a page is
> > > > converted to the sharable type need to be performed to avoid removing
> > > > pages from the p2m that may be used for other purposes. For example,
> > > > currently the reset loop also removes the vcpu info pages from the
> > > > p2m, potentially putting the guest into infinite page-fault loops.
> > > >
> > > > Signed-off-by: Tamas K Lengyel 
> > >
> > > Reviewed-by: Roger Pau Monné 
> >
> > Thanks!
> >
> > >
> > > I've been looking however and I'm not able to spot where you copy the
> > > shared_info data, which I think it's also quite critical for the
> > > domain, as it contains the info about event channels (when using L2).
> > > Also for HVM forks the shared info should be mapped at the same gfn as
> > > the parent, or else the child trying to access it will trigger an EPT
> > > fault that won't be able to populate such gfn, because the shared_info
> > > on the parent is already shared between Xen and the parent.
> >
> > Pages that cause an EPT fault but can't be made shared get a new page
> > allocated for them and copied in mem_sharing_fork_page. There are many
> > pages like that, mostly due to QEMU mapping them and thus holding an
> > extra reference. That said, shared_info is manually copied during
> > forking in copy_special_pages, at the end of the function you will
> > find:
> >
> > old_mfn = _mfn(virt_to_mfn(d->shared_info));
> > new_mfn = _mfn(virt_to_mfn(cd->shared_info));
> >
> > copy_domain_page(new_mfn, old_mfn);
>
> Yes, that's indeed fine, you need to copy the contents of the shared
> info page, but for HVM you also need to make sure the shared info page
> is mapped at the same gfn as the parent. HVM guest issue a hypercall
> to request the mapping of the shared info page to a specific gfn,
> since it's not added to the guest physmap by default.
> XENMAPSPACE_shared_info is used in order to request the shared info
> page to be mapped at a specific gfn.
>
> AFAICT during fork such shared info mapping is not replicated to the
> child, and hence the child trying to access the gfn of the shared info
> page would just result in EPT faults that won't be fixed (because the
> parent shared info page cannot be shared with the child)?
>
> You should be able to use get_gpfn_from_mfn in order to get the parent
> gfn where the shared info mfn is mapped (if any), and then replicate
> this in the child using it's own shared info.
>
> On fork reset you should check if the child shared info gfn != parent
> shared info gfn and reinstate the parent state if different from the
> child.

OK, I see what you mean. The way things are set up currently, the EPT
fault-loop problem doesn't happen since the page does get copied, it
just gets copied to a new mfn, not the one d->shared_info points to. So
whatever issue that may bring must be subtle because we haven't
noticed any instability.

Also, looking at the save/restore code in libxc it seems to me that
shared_info is actually a PV specific page and it doesn't get
saved/restored with an HVM domain. The hvm code paths don't process
REC_TYPE_SHARED_INFO at all. So since forks are exclusively for HVM
domains, do we really need it and if so, why doesn't HVM save/restore
need it?

Tamas



[PATCH] mem_sharing: map shared_info page to same gfn during fork

2020-04-27 Thread Tamas K Lengyel
During a VM fork we copy the shared_info page; however, we also need to ensure
that the page is mapped into the same GFN in the fork as it is in the parent.

Signed-off-by: Tamas K Lengyel 
Suggested-by: Roger Pau Monne 
---
 xen/arch/x86/mm/mem_sharing.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 344a5bfb3d..acbf21b22c 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1656,6 +1656,7 @@ static void copy_tsc(struct domain *cd, struct domain *d)
 static int copy_special_pages(struct domain *cd, struct domain *d)
 {
     mfn_t new_mfn, old_mfn;
+    gfn_t old_gfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(cd);
     static const unsigned int params[] =
     {
@@ -1701,6 +1702,34 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
     new_mfn = _mfn(virt_to_mfn(cd->shared_info));
     copy_domain_page(new_mfn, old_mfn);
 
+    old_gfn = _gfn(get_gpfn_from_mfn(mfn_x(old_mfn)));
+
+    if ( !gfn_eq(old_gfn, INVALID_GFN) )
+    {
+        /* let's make sure shared_info is mapped to the same gfn */
+        gfn_t new_gfn = _gfn(get_gpfn_from_mfn(mfn_x(new_mfn)));
+
+        if ( !gfn_eq(new_gfn, INVALID_GFN) && !gfn_eq(old_gfn, new_gfn) )
+        {
+            /* if shared info is mapped to a different gfn just remove it */
+            rc = p2m->set_entry(p2m, new_gfn, INVALID_MFN, PAGE_ORDER_4K,
+                                p2m_invalid, p2m->default_access, -1);
+            if ( rc )
+                return rc;
+
+            new_gfn = INVALID_GFN;
+        }
+
+        if ( gfn_eq(new_gfn, INVALID_GFN) )
+        {
+            /* if shared info is not currently mapped then map it */
+            rc = p2m->set_entry(p2m, old_gfn, new_mfn, PAGE_ORDER_4K,
+                                p2m_ram_rw, p2m->default_access, -1);
+            if ( rc )
+                return rc;
+        }
+    }
+
     return 0;
 }
 
-- 
2.20.1




Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset

2020-04-24 Thread Tamas K Lengyel
On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné  wrote:
>
> On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > When resetting a VM fork we ought to only remove pages that were
> > allocated for the fork during its execution and the contents copied over
> > from the parent. This can be determined if the page is sharable as
> > special pages used by the fork for other purposes will not pass this
> > test. Unfortunately during the fork reset loop we only partially check
> > whether that's the case. A page's type may indicate it is sharable (pass
> > p2m_is_sharable) but that's not a sufficient check by itself. All checks
> > that are normally performed before a page is converted to the sharable
> > type need to be performed to avoid removing pages from the p2m that may
> > be used for other purposes. For example, currently the reset loop also
> > removes the vcpu info pages from the p2m, potentially putting the guest
> > into infinite page-fault loops.
> >
> > Signed-off-by: Tamas K Lengyel 
>
> Reviewed-by: Roger Pau Monné 

Thanks!

>
> I've been looking however and I'm not able to spot where you copy the
> shared_info data, which I think it's also quite critical for the
> domain, as it contains the info about event channels (when using L2).
> Also for HVM forks the shared info should be mapped at the same gfn as
> the parent, or else the child trying to access it will trigger an EPT
> fault that won't be able to populate such gfn, because the shared_info
> on the parent is already shared between Xen and the parent.

Pages that cause an EPT fault but can't be made shared get a new page
allocated for them and copied in mem_sharing_fork_page. There are many
pages like that, mostly due to QEMU mapping them and thus holding an
extra reference. That said, shared_info is manually copied during
forking in copy_special_pages, at the end of the function you will
find:

old_mfn = _mfn(virt_to_mfn(d->shared_info));
new_mfn = _mfn(virt_to_mfn(cd->shared_info));

copy_domain_page(new_mfn, old_mfn);

Cheers,
Tamas



Re: [ANNOUNCE] Xen 4.14 Development Update

2020-04-24 Thread Tamas K Lengyel
> *  VM forking (v11)
>   -  Tamas K Lengyel

v17 sent recently, hypervisor side is completely merged, only the
toolstack patch is waiting for review & merge

Tamas



Re: [PATCH v16 1/3] mem_sharing: fix sharability check during fork reset

2020-04-22 Thread Tamas K Lengyel
On Wed, Apr 22, 2020 at 9:50 AM Roger Pau Monné  wrote:
>
> On Wed, Apr 22, 2020 at 06:42:42AM -0600, Tamas K Lengyel wrote:
> > On Wed, Apr 22, 2020 at 3:00 AM Roger Pau Monné wrote:
> > >
> > > On Tue, Apr 21, 2020 at 10:47:23AM -0700, Tamas K Lengyel wrote:
> > > > @@ -666,20 +670,31 @@ static int page_make_sharable(struct domain *d,
> > > >   */
> > > >  if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
> > > >  {
> > > > -spin_unlock(&d->page_alloc_lock);
> > > >  /* Return type count back to zero */
> > > >  put_page_and_type(page);
> > > > -return -E2BIG;
> > > > +rc = -E2BIG;
> > > > +goto out;
> > > > +}
> > > > +
> > > > +rc = 0;
> > > > +
> > > > +if ( validate_only )
> > > > +{
> > > > +put_page_and_type(page);
> > >
> > > You seem to check some page attributes but then put the page again,
> > > which looks racy to me. Since you put the page, couldn't the checks
> > > that you have performed be stale by the point the data is consumed by
> > > the caller?
> >
> > During fork reset when this is called with validate_only = true the
> > domain is paused. Furthermore, fork reset is only for forks that have
> > no device model running, so nothing is interacting with its memory
> > that could acquire extra references. So no, this isn't racy since
> > there is nothing to race against that I'm aware of. Also, this check
> > is really to check for special pages, all of which are setup during
> > the initial fork process, not during runtime of the fork.
>
> Right, it would feel safer to me however if you just return from
> page_make_sharable while having a page reference, and drop it in
> mem_sharing_fork_reset if the page shouldn't be removed from the fork.
>
> This way you could also avoid having to take an extra reference just
> after returning from nominate_page in mem_sharing_fork_reset.
> page_make_sharable already returns while having taken an extra
> reference to the page in the non validate only case anyway.

Ah yea, that would make sense. Good idea!

> > >
> > > > +goto out;
> > > >  }
> > > >
> > > >  page_set_owner(page, dom_cow);
> > > >  drop_dom_ref = !domain_adjust_tot_pages(d, -1);
> > > >  page_list_del(page, &d->page_list);
> > > > -spin_unlock(&d->page_alloc_lock);
> > > >
> > > > +out:
> > > > +if ( !validate_only )
> > > > +spin_unlock(&d->page_alloc_lock);
> > > >  if ( drop_dom_ref )
> > > >  put_domain(d);
> > > > -return 0;
> > > > +
> > > > +return rc;
> > > >  }
> > > >
> > > >  static int page_make_private(struct domain *d, struct page_info *page)
> > > > @@ -809,8 +824,8 @@ static int debug_gref(struct domain *d, grant_ref_t 
> > > > ref)
> > > >  return debug_gfn(d, gfn);
> > > >  }
> > > >
> > > > -static int nominate_page(struct domain *d, gfn_t gfn,
> > > > - int expected_refcnt, shr_handle_t *phandle)
> > > > +static int nominate_page(struct domain *d, gfn_t gfn, int expected_refcnt,
> > >
> > > Is there any reason for expected_refcnt to be signed? All callers use
> > > unsigned values.
> >
> > No reason. It's just how the code was written by the original author
> > and we never changed it.
>
> Since you are already changing those lines, can I ask you to also
> change it to unsigned in the places that you touch?

Sure thing.

Thanks,
Tamas



Re: [PATCH v16 2/3] mem_sharing: allow forking domain with IOMMU enabled

2020-04-22 Thread Tamas K Lengyel
On Wed, Apr 22, 2020 at 3:09 AM Roger Pau Monné  wrote:
>
> On Tue, Apr 21, 2020 at 10:47:24AM -0700, Tamas K Lengyel wrote:
> > The memory sharing subsystem by default doesn't allow a domain to share 
> > memory
> > if it has an IOMMU active for obvious security reasons. However, when 
> > fuzzing a
> > VM fork, the same security restrictions don't necessarily apply. While it 
> > makes
> > no sense to try to create a full fork of a VM that has an IOMMU attached as 
> > only
> > one domain can own the pass-through device at a time, creating a shallow 
> > fork
> > without a device model is still very useful for fuzzing kernel-mode drivers.
> >
> > By allowing the parent VM to initialize the kernel-mode driver with a real
> > device that's pass-through, the driver can enter into a state more suitable 
> > for
> > fuzzing. Some of these initialization steps are quite complex and are 
> > easier to
> > perform when a real device is present. After the initialization, shallow 
> > forks
> > can be utilized for fuzzing code-segments in the device driver that don't
> > directly interact with the device.
> >
> > Signed-off-by: Tamas K Lengyel 
>
> Reviewed-by: Roger Pau Monné 

Thanks! This can be merged independent of the other patches in the series.

Tamas



Re: [PATCH v16 1/3] mem_sharing: fix sharability check during fork reset

2020-04-22 Thread Tamas K Lengyel
On Wed, Apr 22, 2020 at 3:00 AM Roger Pau Monné  wrote:
>
> On Tue, Apr 21, 2020 at 10:47:23AM -0700, Tamas K Lengyel wrote:
> > When resetting a VM fork we ought to only remove pages that were
> > allocated for the fork during its execution and the contents copied over
> > from the parent.
> > This can be determined if the page is sharable as special pages used by the
> > fork for other purposes will not pass this test.
>
> Would it be easier to just check if the page refcount is > 1? (as I
> expect Xen is also holding a reference to this page)

That by itself is not necessarily enough.

>
> > Unfortunately during the fork reset loop we only partially check whether
> > that's the case. A page's type may indicate it is sharable (pass
> > p2m_is_sharable) but that's not a sufficient check by itself. All checks
> > that are normally performed before a page is converted to the sharable
> > type need to be performed to avoid removing pages from the p2m that may
> > be used for other purposes. For example, currently the reset loop also
> > removes the vcpu info pages from the p2m, potentially putting the guest
> > into infinite page-fault loops.
> >
> > For this we extend the existing nominate_page and page_make_sharable
> > functions to perform a validation-only run without actually converting
> > the page.
>
> Maybe you could split that chunk into a separate helper that just
> performs the checks?

I think it's fine this way that we just bail half-way through the
process of making the page shared. Splitting this out to a helper
would require a lot more code-shuffling.

>
> > Signed-off-by: Tamas K Lengyel 
> > ---
> >  xen/arch/x86/mm/mem_sharing.c | 79 ++-
> >  1 file changed, 50 insertions(+), 29 deletions(-)
> >
> > diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
> > index e572e9e39d..d8ed660abb 100644
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -633,31 +633,35 @@ unsigned int mem_sharing_get_nr_shared_mfns(void)
> >  /* Functions that change a page's type and ownership */
> >  static int page_make_sharable(struct domain *d,
> >struct page_info *page,
> > -  int expected_refcnt)
> > +  int expected_refcnt,
> > +  bool validate_only)
> >  {
> > -bool_t drop_dom_ref;
> > +int rc;
> > +bool drop_dom_ref = false;
> >
> > > -spin_lock(&d->page_alloc_lock);
> > +/* caller already has the lock when validating only */
> > +if ( !validate_only )
> > > +spin_lock(&d->page_alloc_lock);
>
> page_alloc_lock seems to be used as a recursive lock by some callers,
> could you do the same here?

We can do that, yes.

>
> >
> >  if ( d->is_dying )
> >  {
> > > -spin_unlock(&d->page_alloc_lock);
> > -return -EBUSY;
> > +rc = -EBUSY;
> > +goto out;
> >  }
> >
> >  /* Change page type and count atomically */
> >  if ( !get_page_and_type(page, d, PGT_shared_page) )
> >  {
> > > -spin_unlock(&d->page_alloc_lock);
> > -return -EINVAL;
> > +rc = -EINVAL;
> > +goto out;
> >  }
> >
> >  /* Check it wasn't already sharable and undo if it was */
> >  if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
> >  {
> > > -spin_unlock(&d->page_alloc_lock);
> >  put_page_and_type(page);
> > -return -EEXIST;
> > +rc = -EEXIST;
> > +goto out;
> >  }
> >
> >  /*
> > @@ -666,20 +670,31 @@ static int page_make_sharable(struct domain *d,
> >   */
> >  if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
> >  {
> > > -spin_unlock(&d->page_alloc_lock);
> >  /* Return type count back to zero */
> >  put_page_and_type(page);
> > -return -E2BIG;
> > +rc = -E2BIG;
> > +goto out;
> > +}
> > +
> > +rc = 0;
> > +
> > +if ( validate_only )
> > +{
> > +put_page_and_type(page);
>
> You seem to check some page attributes but then put the page again,
> which looks racy to me. Since you put the page, couldn't the checks
> that you have performed be stale by the point the data is consumed by
> the caller?

During fork reset when this is called with validate_

[PATCH v17 2/2] xen/tools: VM forking toolstack side

2020-04-23 Thread Tamas K Lengyel
Add necessary bits to implement "xl fork-vm" commands. The command allows the
user to specify how to launch the device model allowing for a late-launch model
in which the user can execute the fork without the device model and decide to
only later launch it.

Signed-off-by: Tamas K Lengyel 
---
 docs/man/xl.1.pod.in  |  49 +
 tools/libxc/include/xenctrl.h |  14 ++
 tools/libxc/xc_memshr.c   |  26 +++
 tools/libxl/libxl.h   |  12 ++
 tools/libxl/libxl_create.c| 361 +++---
 tools/libxl/libxl_dm.c|   2 +-
 tools/libxl/libxl_dom.c   |  43 +++-
 tools/libxl/libxl_internal.h  |   7 +
 tools/libxl/libxl_types.idl   |   1 +
 tools/libxl/libxl_x86.c   |  42 
 tools/xl/Makefile |   2 +-
 tools/xl/xl.h |   5 +
 tools/xl/xl_cmdtable.c|  15 ++
 tools/xl/xl_forkvm.c  | 149 ++
 tools/xl/xl_vmcontrol.c   |  14 ++
 15 files changed, 576 insertions(+), 166 deletions(-)
 create mode 100644 tools/xl/xl_forkvm.c

diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
index 09339282e6..67b4e8588a 100644
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -708,6 +708,55 @@ above).
 
 =back
 
+=item B<fork-vm> [I<OPTIONS>] I<domain-id>
+
+Create a fork of a running VM.  The domain will be paused after the operation
+and remains paused while forks of it exist.  Experimental and x86 only.
+Forks can only be made of domains with HAP enabled and on Intel hardware.  The
+parent domain must be created with the xl toolstack and its configuration must
+not manually define max_grant_frames, max_maptrack_frames or max_event_channels.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-p>
+
+Leave the fork paused after creating it.
+
+=item B<--launch-dm>
+
+Specify whether the device model (QEMU) should be launched for the fork. Late
+launch allows starting the device model for an already running fork.
+
+=item B<-C>
+
+The config file to use when launching the device model.  Currently required
+when launching the device model.  Most config settings MUST match the parent
+domain exactly, only change VM name, disk path and network configurations.
+
+=item B<-Q>
+
+The path to the qemu save file to use when launching the device model.
+Currently required when launching the device model.
+
+=item B<--fork-reset>
+
+Perform a reset operation of an already running fork.  Note that resetting may
+be less performant than creating a new fork depending on how much memory the
+fork has deduplicated during its runtime.
+
+=item B<--max-vcpus>
+
+Specify the max-vcpus matching the parent domain when not launching the dm.
+
+=item B<--allow-iommu>
+
+Specify to allow forking a domain that has IOMMU enabled. Only compatible with
+forks using --launch-dm no.
+
+=back
+
 =item B<sharing> [I<domain-id>]
 
 Display the number of shared pages for a specified domain. If no domain is
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 5f25c5a6d4..0a6ff93229 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2232,6 +2232,20 @@ int xc_memshr_range_share(xc_interface *xch,
   uint64_t first_gfn,
   uint64_t last_gfn);
 
+int xc_memshr_fork(xc_interface *xch,
+                   uint32_t source_domain,
+                   uint32_t client_domain,
+                   bool allow_with_iommu);
+
+/*
+ * Note: this function is only intended to be used on short-lived forks that
+ * haven't yet acquired a lot of memory. In case the fork has a lot of memory
+ * it is likely more performant to create a new fork with xc_memshr_fork.
+ *
+ * With VMs that have a lot of memory this call may block for a long time.
+ */
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain);
+
 /* Debug calls: return the number of pages referencing the shared frame backing
  * the input argument. Should be one or greater.
  *
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 97e2e6a8d9..2300cc7075 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -239,6 +239,32 @@ int xc_memshr_debug_gref(xc_interface *xch,
     return xc_memshr_memop(xch, domid, &mso);
 }
 
+int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
+                   bool allow_with_iommu)
+{
+    xen_mem_sharing_op_t mso;
+
+    memset(&mso, 0, sizeof(mso));
+
+    mso.op = XENMEM_sharing_op_fork;
+    mso.u.fork.parent_domain = pdomid;
+
+    if ( allow_with_iommu )
+        mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+
+    return xc_memshr_memop(xch, domid, &mso);
+}
+
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid)
+{
+    xen_mem_sharing_op_t mso;
+
+    memset(&mso, 0, sizeof(mso));
+    mso.op = XENMEM_sharing_op_fork_reset;
+
+    return xc_memshr_memop(xch, domid, &mso);
+}
+
 int xc_memshr_audit(xc_interface *xch)
 {
 xen_mem_sharing_op_t mso;
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 71709dc58

[PATCH v17 1/2] mem_sharing: fix sharability check during fork reset

2020-04-23 Thread Tamas K Lengyel
When resetting a VM fork we ought to only remove pages that were allocated for
the fork during its execution and the contents copied over from the parent.
This can be determined if the page is sharable as special pages used by the
fork for other purposes will not pass this test. Unfortunately during the fork
reset loop we only partially check whether that's the case. A page's type may
indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
check by itself. All checks that are normally performed before a page is
converted to the sharable type need to be performed to avoid removing pages
from the p2m that may be used for other purposes. For example, currently the
reset loop also removes the vcpu info pages from the p2m, potentially putting
the guest into infinite page-fault loops.

Signed-off-by: Tamas K Lengyel 
---
v17: Changes based on feedback from Roger
---
 xen/arch/x86/mm/mem_sharing.c | 83 ---
 1 file changed, 47 insertions(+), 36 deletions(-)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index bb74595351..344a5bfb3d 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -633,31 +633,33 @@ unsigned int mem_sharing_get_nr_shared_mfns(void)
 /* Functions that change a page's type and ownership */
 static int page_make_sharable(struct domain *d,
                               struct page_info *page,
-                              int expected_refcnt)
+                              unsigned int expected_refcnt,
+                              bool validate_only)
 {
-    bool_t drop_dom_ref;
+    int rc = 0;
+    bool drop_dom_ref = false;
 
-    spin_lock(&d->page_alloc_lock);
+    spin_lock_recursive(&d->page_alloc_lock);
 
     if ( d->is_dying )
     {
-        spin_unlock(&d->page_alloc_lock);
-        return -EBUSY;
+        rc = -EBUSY;
+        goto out;
     }
 
     /* Change page type and count atomically */
     if ( !get_page_and_type(page, d, PGT_shared_page) )
     {
-        spin_unlock(&d->page_alloc_lock);
-        return -EINVAL;
+        rc = -EINVAL;
+        goto out;
     }
 
     /* Check it wasn't already sharable and undo if it was */
     if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
     {
-        spin_unlock(&d->page_alloc_lock);
         put_page_and_type(page);
-        return -EEXIST;
+        rc = -EEXIST;
+        goto out;
     }
 
     /*
@@ -666,20 +668,26 @@ static int page_make_sharable(struct domain *d,
      */
     if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
     {
-        spin_unlock(&d->page_alloc_lock);
         /* Return type count back to zero */
         put_page_and_type(page);
-        return -E2BIG;
+        rc = -E2BIG;
+        goto out;
     }
 
-    page_set_owner(page, dom_cow);
-    drop_dom_ref = !domain_adjust_tot_pages(d, -1);
-    page_list_del(page, &d->page_list);
-    spin_unlock(&d->page_alloc_lock);
+    if ( !validate_only )
+    {
+        page_set_owner(page, dom_cow);
+        drop_dom_ref = !domain_adjust_tot_pages(d, -1);
+        page_list_del(page, &d->page_list);
+    }
+
+ out:
+    spin_unlock_recursive(&d->page_alloc_lock);
 
     if ( drop_dom_ref )
         put_domain(d);
-    return 0;
+
+    return rc;
 }
 
 static int page_make_private(struct domain *d, struct page_info *page)
@@ -810,7 +818,8 @@ static int debug_gref(struct domain *d, grant_ref_t ref)
 }
 
 static int nominate_page(struct domain *d, gfn_t gfn,
-                         int expected_refcnt, shr_handle_t *phandle)
+                         unsigned int expected_refcnt, bool validate_only,
+                         shr_handle_t *phandle)
 {
 struct p2m_domain *hp2m = p2m_get_hostp2m(d);
 p2m_type_t p2mt;
@@ -879,8 +888,8 @@ static int nominate_page(struct domain *d, gfn_t gfn,
 }
 
     /* Try to convert the mfn to the sharable type */
-    ret = page_make_sharable(d, page, expected_refcnt);
-    if ( ret )
+    ret = page_make_sharable(d, page, expected_refcnt, validate_only);
+    if ( ret || validate_only )
         goto out;
 
 /*
@@ -1392,13 +1401,13 @@ static int range_share(struct domain *d, struct domain *cd,
          * We only break out if we run out of memory as individual pages may
          * legitimately be unsharable and we just want to skip over those.
          */
-        rc = nominate_page(d, _gfn(start), 0, &sh);
+        rc = nominate_page(d, _gfn(start), 0, false, &sh);
         if ( rc == -ENOMEM )
             break;
 
         if ( !rc )
         {
-            rc = nominate_page(cd, _gfn(start), 0, &ch);
+            rc = nominate_page(cd, _gfn(start), 0, false, &ch);
             if ( rc == -ENOMEM )
                 break;
 
@@ -1478,7 +1487,7 @@ int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
         /* For read-only accesses we just add a shared entry to the physmap */
         while ( parent )
         {
-            if ( !(rc = nominate_page(

Re: [PATCH v17 2/2] xen/tools: VM forking toolstack side

2020-05-01 Thread Tamas K Lengyel
On Thu, Apr 23, 2020 at 9:33 AM Tamas K Lengyel  wrote:
>
> Add necessary bits to implement "xl fork-vm" commands. The command allows the
> user to specify how to launch the device model allowing for a late-launch 
> model
> in which the user can execute the fork without the device model and decide to
> only later launch it.
>
> Signed-off-by: Tamas K Lengyel 

Patch ping. If nothing else at least the libxc parts would be nice to
get merged before the freeze.



[PATCH v16 2/3] mem_sharing: allow forking domain with IOMMU enabled

2020-04-21 Thread Tamas K Lengyel
The memory sharing subsystem by default doesn't allow a domain to share memory
if it has an IOMMU active for obvious security reasons. However, when fuzzing a
VM fork, the same security restrictions don't necessarily apply. While it makes
no sense to try to create a full fork of a VM that has an IOMMU attached as only
one domain can own the pass-through device at a time, creating a shallow fork
without a device model is still very useful for fuzzing kernel-mode drivers.

By allowing the parent VM to initialize the kernel-mode driver with a real
device that's pass-through, the driver can enter into a state more suitable for
fuzzing. Some of these initialization steps are quite complex and are easier to
perform when a real device is present. After the initialization, shallow forks
can be utilized for fuzzing code-segments in the device driver that don't
directly interact with the device.

Signed-off-by: Tamas K Lengyel 
---
v16: Minor fixes based on feedback
---
 xen/arch/x86/mm/mem_sharing.c | 20 +---
 xen/include/public/memory.h   |  4 +++-
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index d8ed660abb..e690d2fa13 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1445,7 +1445,8 @@ static int range_share(struct domain *d, struct domain 
*cd,
 return rc;
 }
 
-static inline int mem_sharing_control(struct domain *d, bool enable)
+static inline int mem_sharing_control(struct domain *d, bool enable,
+                                      uint16_t flags)
 {
     if ( enable )
     {
@@ -1455,7 +1456,8 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
         if ( unlikely(!hap_enabled(d)) )
             return -ENODEV;
 
-        if ( unlikely(is_iommu_enabled(d)) )
+        if ( unlikely(is_iommu_enabled(d) &&
+                      !(flags & XENMEM_FORK_WITH_IOMMU_ALLOWED)) )
             return -EXDEV;
     }
 
@@ -1848,7 +1850,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
     if ( rc )
         goto out;
 
-    if ( !mem_sharing_enabled(d) && (rc = mem_sharing_control(d, true)) )
+    if ( !mem_sharing_enabled(d) &&
+         (rc = mem_sharing_control(d, true, 0)) )
         return rc;
 
     switch ( mso.op )
@@ -2086,7 +2089,9 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         struct domain *pd;
 
         rc = -EINVAL;
-        if ( mso.u.fork.pad[0] || mso.u.fork.pad[1] || mso.u.fork.pad[2] )
+        if ( mso.u.fork.pad )
+            goto out;
+        if ( mso.u.fork.flags & ~XENMEM_FORK_WITH_IOMMU_ALLOWED )
             goto out;
 
         rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
                                                &pd);
@@ -2101,7 +2106,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
             goto out;
         }
 
-        if ( !mem_sharing_enabled(pd) && (rc = mem_sharing_control(pd, true)) )
+        if ( !mem_sharing_enabled(pd) &&
+             (rc = mem_sharing_control(pd, true, mso.u.fork.flags)) )
         {
             rcu_unlock_domain(pd);
             goto out;
@@ -2122,7 +2128,7 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         struct domain *pd;
 
         rc = -EINVAL;
-        if ( mso.u.fork.pad[0] || mso.u.fork.pad[1] || mso.u.fork.pad[2] )
+        if ( mso.u.fork.pad || mso.u.fork.flags )
             goto out;
 
         rc = -ENOSYS;
@@ -2159,7 +2165,7 @@ int mem_sharing_domctl(struct domain *d, struct xen_domctl_mem_sharing_op *mec)
     switch ( mec->op )
     {
     case XEN_DOMCTL_MEM_SHARING_CONTROL:
-        rc = mem_sharing_control(d, mec->u.enable);
+        rc = mem_sharing_control(d, mec->u.enable, 0);
         break;
 
     default:
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index d36d64b8dc..e56800357d 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -536,7 +536,9 @@ struct xen_mem_sharing_op {
         } debug;
         struct mem_sharing_op_fork {      /* OP_FORK */
             domid_t parent_domain;        /* IN: parent's domain id */
-            uint16_t pad[3];              /* Must be set to 0 */
+#define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
+            uint16_t flags;               /* IN: optional settings */
+            uint32_t pad;                 /* Must be set to 0 */
         } fork;
     } u;
 };
-- 
2.20.1




[PATCH v16 0/3] VM forking

2020-04-21 Thread Tamas K Lengyel
The following patches are part of the series that implement VM forking for
Intel HVM guests to allow for the fast creation of identical VMs without the
assosciated high startup costs of booting or restoring the VM from a savefile.

JIRA issue: https://xenproject.atlassian.net/browse/XEN-89

The fork operation is implemented as part of the "xl fork-vm" command:
xl fork-vm -C <config> -Q <qemu-save-file> -m <max-vcpus> <parent_domid>

By default a fully functional fork is created. The user is however in charge of
creating the appropriate config file for the fork and of generating the QEMU
save file before the fork-vm call is made. The config file needs to give the
fork a new name at minimum but other settings may also require changes. Certain
settings in the config file of both the parent and the fork have to be set to
default. Details are documented.

The interface also allows to split the forking into two steps:
xl fork-vm --launch-dm no \
           -m <max-vcpus> \
           -p <parent_domid>
xl fork-vm --launch-dm late \
           -C <config> \
           -Q <qemu-save-file> \
           <fork_domid>

The split creation model is useful when the VM needs to be created as fast as
possible. The forked VM can be unpaused without the device model being launched
and can then be monitored and accessed via VMI. Note however that without its
device model running (depending on what is executing in the VM) it is bound to
misbehave or even crash when it tries to access devices that would be emulated
by QEMU. We anticipate that for certain use-cases this would be an acceptable
situation, for example when fuzzing code segments that don't access such
devices.

Launching the device model requires the QEMU Xen savefile to be generated
manually from the parent VM. This can be accomplished simply by connecting to
its QMP socket and issuing the "xen-save-devices-state" command. For example
using the standard tool socat these commands can be used to generate the file:
socat - UNIX-CONNECT:/var/run/xen/qmp-libxl-<parent_domid>
{ "execute": "qmp_capabilities" }
{ "execute": "xen-save-devices-state", \
"arguments": { "filename": "/path/to/save/qemu_state", \
"live": false} }

At runtime the forked VM starts running with an empty p2m which gets lazily
populated when the VM generates EPT faults, similar to how altp2m views are
populated. If the memory access is a read-only access, the p2m entry is
populated with a memory shared entry with its parent. For write memory accesses
or in case memory sharing wasn't possible (for example in case a reference is
held by a third party), a new page is allocated and the page contents are
copied over from the parent VM. Forks can be further forked if needed, thus
allowing for further memory savings.
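
In pseudocode, that populate-on-fault policy amounts to the following
simplified sketch of mem_sharing_fork_page() from patch 1 of the VM forking
series; share_with_parent() and copy_from_parent() are hypothetical stand-ins
for the nominate/add_to_physmap path and the allocate-and-copy path,
respectively:

/* Simplified decision logic for an EPT fault taken by a fork. */
static int handle_fork_fault(struct domain *fork, gfn_t gfn, bool write)
{
    if ( !write )
    {
        /* Read access: try to map the parent's page as a shared entry. */
        if ( !share_with_parent(fork, gfn) )
            return 0;
    }

    /*
     * Write access, or sharing failed (e.g. a third party holds an extra
     * reference): allocate a fresh page and copy the parent's contents.
     */
    return copy_from_parent(fork, gfn);
}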

A VM fork reset hypercall is also added that allows the fork to be reset to the
state it was just after a fork, also accessible via xl:
xl fork-vm --fork-reset -p <fork_domid>

This is an optimization for cases where the forks are very short-lived and run
without a device model, so resetting saves some time compared to creating a
brand new fork provided the fork has not acquired a lot of memory. If the fork
has a lot of memory deduplicated it is likely going to be faster to create a
new fork from scratch and asynchronously destroy the old one.

The series has been tested with Windows VMs and functions as expected. Linux
VMs when forked from a running VM will have a frozen VNC screen. Linux VMs at
this time can only be forked with a working device model when the parent VM was
restored from a snapshot using "xl restore -p". This is a known limitation.
Also note that PVHVM/PVH Linux guests have not been tested. Forking most likely
works but PV devices and drivers would require additional wiring to set things
up properly since the guests are unaware of the forking taking place, unlike
the save/restore routine where the guest is made aware of the procedure.

Forking time has been measured to be 0.0007s, device model launch to be around
1s depending largely on the number of devices being emulated. Fork resets have
been measured to be 0.0001s under the optimal circumstances.

New in v16:
A better bugfix for fork reset issue
Minor fixes for the IOMMU allow patch based on feedback

Patch 1 fix for VM fork reset removing pages from the p2m that it shouldn't
Patch 2 adds option to fork a domain with IOMMU active
Patch 3 adds the toolstack-side code implementing VM forking and reset

Tamas K Lengyel (3):
  mem_sharing: fix sharability check during fork reset
  mem_sharing: allow forking domain with IOMMU enabled
  xen/tools: VM forking toolstack side

 docs/man/xl.1.pod.in  |  44 +
 tools/libxc/include/xenctrl.h |  14 ++
 tools/libxc/xc_memshr.c   |  26 +++
 tools/libxl/libxl.h   |  12 ++
 tools/libxl/libxl_create.c| 361 +++---
 tools/libxl/libxl_dm.c|   2 +-

[PATCH v16 1/3] mem_sharing: fix sharability check during fork reset

2020-04-21 Thread Tamas K Lengyel
When resetting a VM fork we ought to only remove pages that were allocated for
the fork during its execution and the contents copied over from the parent.
This can be determined if the page is sharable as special pages used by the
fork for other purposes will not pass this test. Unfortunately during the fork
reset loop we only partially check whether that's the case. A page's type may
indicate it is sharable (pass p2m_is_sharable) but that's not a sufficient
check by itself. All checks that are normally performed before a page is
converted to the sharable type need to be performed to avoid removing pages
from the p2m that may be used for other purposes. For example, currently the
reset loop also removes the vcpu info pages from the p2m, potentially putting
the guest into infinite page-fault loops.

For this we extend the existing nominate_page and page_make_sharable functions
to perform a validation-only run without actually converting the page.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/mm/mem_sharing.c | 79 ++-
 1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index e572e9e39d..d8ed660abb 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -633,31 +633,35 @@ unsigned int mem_sharing_get_nr_shared_mfns(void)
 /* Functions that change a page's type and ownership */
 static int page_make_sharable(struct domain *d,
                               struct page_info *page,
-                              int expected_refcnt)
+                              int expected_refcnt,
+                              bool validate_only)
 {
-    bool_t drop_dom_ref;
+    int rc;
+    bool drop_dom_ref = false;
 
-    spin_lock(&d->page_alloc_lock);
+    /* caller already has the lock when validating only */
+    if ( !validate_only )
+        spin_lock(&d->page_alloc_lock);
 
     if ( d->is_dying )
     {
-        spin_unlock(&d->page_alloc_lock);
-        return -EBUSY;
+        rc = -EBUSY;
+        goto out;
     }
 
     /* Change page type and count atomically */
     if ( !get_page_and_type(page, d, PGT_shared_page) )
     {
-        spin_unlock(&d->page_alloc_lock);
-        return -EINVAL;
+        rc = -EINVAL;
+        goto out;
     }
 
     /* Check it wasn't already sharable and undo if it was */
     if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
     {
-        spin_unlock(&d->page_alloc_lock);
         put_page_and_type(page);
-        return -EEXIST;
+        rc = -EEXIST;
+        goto out;
     }
 
     /*
@@ -666,20 +670,31 @@ static int page_make_sharable(struct domain *d,
      */
     if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
     {
-        spin_unlock(&d->page_alloc_lock);
         /* Return type count back to zero */
         put_page_and_type(page);
-        return -E2BIG;
+        rc = -E2BIG;
+        goto out;
+    }
+
+    rc = 0;
+
+    if ( validate_only )
+    {
+        put_page_and_type(page);
+        goto out;
     }
 
     page_set_owner(page, dom_cow);
     drop_dom_ref = !domain_adjust_tot_pages(d, -1);
     page_list_del(page, &d->page_list);
-    spin_unlock(&d->page_alloc_lock);
 
+ out:
+    if ( !validate_only )
+        spin_unlock(&d->page_alloc_lock);
     if ( drop_dom_ref )
         put_domain(d);
-    return 0;
+
+    return rc;
 }
 
 static int page_make_private(struct domain *d, struct page_info *page)
@@ -809,8 +824,8 @@ static int debug_gref(struct domain *d, grant_ref_t ref)
 return debug_gfn(d, gfn);
 }
 
-static int nominate_page(struct domain *d, gfn_t gfn,
-                         int expected_refcnt, shr_handle_t *phandle)
+static int nominate_page(struct domain *d, gfn_t gfn, int expected_refcnt,
+                         bool validate_only, shr_handle_t *phandle)
 {
 struct p2m_domain *hp2m = p2m_get_hostp2m(d);
 p2m_type_t p2mt;
@@ -879,8 +894,8 @@ static int nominate_page(struct domain *d, gfn_t gfn,
 }
 
     /* Try to convert the mfn to the sharable type */
-    ret = page_make_sharable(d, page, expected_refcnt);
-    if ( ret )
+    ret = page_make_sharable(d, page, expected_refcnt, validate_only);
+    if ( ret || validate_only )
         goto out;
 
 /*
@@ -1392,13 +1407,13 @@ static int range_share(struct domain *d, struct domain *cd,
          * We only break out if we run out of memory as individual pages may
          * legitimately be unsharable and we just want to skip over those.
          */
-        rc = nominate_page(d, _gfn(start), 0, &sh);
+        rc = nominate_page(d, _gfn(start), 0, false, &sh);
         if ( rc == -ENOMEM )
             break;
 
         if ( !rc )
        {
-            rc = nominate_page(cd, _gfn(start), 0, &ch);
+            rc = nominate_page(cd, _gfn(start), 0, false, &ch);
             if ( rc == -ENOMEM )
                 break;
 
@@ -1476,7 +1491,7 @@ int mem_sharing_fork_page(struct domain *d, gfn_t gfn, b

[PATCH v16 3/3] xen/tools: VM forking toolstack side

2020-04-21 Thread Tamas K Lengyel
Add necessary bits to implement "xl fork-vm" commands. The command allows the
user to specify how to launch the device model allowing for a late-launch model
in which the user can execute the fork without the device model and decide to
only later launch it.

Signed-off-by: Tamas K Lengyel 
---
 docs/man/xl.1.pod.in  |  44 +
 tools/libxc/include/xenctrl.h |  14 ++
 tools/libxc/xc_memshr.c   |  26 +++
 tools/libxl/libxl.h   |  12 ++
 tools/libxl/libxl_create.c| 361 +++---
 tools/libxl/libxl_dm.c|   2 +-
 tools/libxl/libxl_dom.c   |  43 +++-
 tools/libxl/libxl_internal.h  |   7 +
 tools/libxl/libxl_types.idl   |   1 +
 tools/libxl/libxl_x86.c   |  42 
 tools/xl/Makefile |   2 +-
 tools/xl/xl.h |   5 +
 tools/xl/xl_cmdtable.c|  15 ++
 tools/xl/xl_forkvm.c  | 149 ++
 tools/xl/xl_vmcontrol.c   |  14 ++
 15 files changed, 571 insertions(+), 166 deletions(-)
 create mode 100644 tools/xl/xl_forkvm.c

diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
index 09339282e6..59c03c6427 100644
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -708,6 +708,50 @@ above).
 
 =back
 
+=item B<fork-vm> [I<OPTIONS>] I<domain-id>
+
+Create a fork of a running VM.  The domain will be paused after the operation
+and remains paused while forks of it exist.  Experimental and x86 only.
+Forks can only be made of domains with HAP enabled and on Intel hardware.  The
+parent domain must be created with the xl toolstack and its configuration must
+not manually define max_grant_frames, max_maptrack_frames or max_event_channels.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-p>
+
+Leave the fork paused after creating it.
+
+=item B<--launch-dm>
+
+Specify whether the device model (QEMU) should be launched for the fork. Late
+launch allows starting the device model for an already running fork.
+
+=item B<-C>
+
+The config file to use when launching the device model.  Currently required
+when launching the device model.  Most config settings MUST match the parent
+domain exactly, only change VM name, disk path and network configurations.
+
+=item B<-Q>
+
+The path to the qemu save file to use when launching the device model.
+Currently required when launching the device model.
+
+=item B<--fork-reset>
+
+Perform a reset operation of an already running fork.  Note that resetting may
+be less performant than creating a new fork depending on how much memory the
+fork has deduplicated during its runtime.
+
+=item B<--max-vcpus>
+
+Specify the max-vcpus matching the parent domain when not launching the dm.
+
+=back
+
 =item B<sharing> [I<domain-id>]
 
 Display the number of shared pages for a specified domain. If no domain is
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 5f25c5a6d4..0a6ff93229 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2232,6 +2232,20 @@ int xc_memshr_range_share(xc_interface *xch,
   uint64_t first_gfn,
   uint64_t last_gfn);
 
+int xc_memshr_fork(xc_interface *xch,
+                   uint32_t source_domain,
+                   uint32_t client_domain,
+                   bool allow_with_iommu);
+
+/*
+ * Note: this function is only intended to be used on short-lived forks that
+ * haven't yet acquired a lot of memory. In case the fork has a lot of memory
+ * it is likely more performant to create a new fork with xc_memshr_fork.
+ *
+ * With VMs that have a lot of memory this call may block for a long time.
+ */
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain);
+
 /* Debug calls: return the number of pages referencing the shared frame backing
  * the input argument. Should be one or greater.
  *
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 97e2e6a8d9..2300cc7075 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -239,6 +239,32 @@ int xc_memshr_debug_gref(xc_interface *xch,
     return xc_memshr_memop(xch, domid, &mso);
 }
 
+int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
+   bool allow_with_iommu)
+{
+xen_mem_sharing_op_t mso;
+
+memset(&mso, 0, sizeof(mso));
+
+mso.op = XENMEM_sharing_op_fork;
+mso.u.fork.parent_domain = pdomid;
+
+if ( allow_with_iommu )
+mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+
+return xc_memshr_memop(xch, domid, &mso);
+}
+
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid)
+{
+xen_mem_sharing_op_t mso;
+
+memset(&mso, 0, sizeof(mso));
+mso.op = XENMEM_sharing_op_fork_reset;
+
+return xc_memshr_memop(xch, domid, &mso);
+}
+
 int xc_memshr_audit(xc_interface *xch)
 {
 xen_mem_sharing_op_t mso;
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 71709dc585..d8da347d4e 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -2666,6 +2666,18 @@ int libxl_psr_get_hw_
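
For illustration, a minimal caller of the two libxc functions added above
might look like the sketch below (hypothetical; creating the empty fork domain
itself and all error handling are elided):

    /* Hypothetical usage sketch for the fork API declared above.  Assumes
     * "parent" is a paused, mem-sharing-enabled HVM domain and "fork" is a
     * freshly created, empty domain. */
    #include <xenctrl.h>

    static int run_fork_cycle(xc_interface *xch, uint32_t parent, uint32_t fork)
    {
        /* Populate the fork lazily from the parent; no IOMMU passthrough. */
        int rc = xc_memshr_fork(xch, parent, fork, false);

        /* ... run or fuzz the fork here ... */

        /* Shed all memory the fork allocated and reload its vCPU state. */
        if ( !rc )
            rc = xc_memshr_fork_reset(xch, fork);

        return rc;
    }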

Re: Xen Coding style

2020-05-08 Thread Tamas K Lengyel
On Fri, May 8, 2020 at 6:21 AM Julien Grall  wrote:
>
> Hi Jan,
>
> On 08/05/2020 12:20, Jan Beulich wrote:
> > On 08.05.2020 12:00, Julien Grall wrote:
> >> You seem to be the maintainer with the most unwritten rules. Would
> >> you mind to have a try at writing a coding style based on it?
> >
> > On the basis that even small, single aspect patches to CODING_STYLE
> > have been ignored [1],
>
> Your thread is one of the example why I started this thread. Agreeing on
> specific rule doesn't work because it either result to bikesheding or
> there is not enough interest to review rule by rule.
>
> > I don't think this would be a good use of my
> > time.
>
> I would have assumed that the current situation (i.e
> nitpicking/bikeshedding on the ML) is not a good use of your time :).
>
> I would be happy to put some effort to help getting the coding style
> right, however I believe focusing on an overall coding style would value
> everyone's time better than a rule by rule discussion.
>
> > If I was promised (reasonable) feedback, I could take what I
> > have and try to add at least a few more things based on what I find
> > myself commenting on more frequently. But really I'd prefer it to
> > be done the other way around - for people to look at the patches
> > already sent, and for me to only subsequently send more. After all,
> > if already those adjustments are controversial, I don't think we
> > could settle on others.
> While I understand this requires another investment from your part, I am
> afraid it is going to be painful for someone else to go through all the
> existing coding style bikeshedding and infer your unwritten rules.
>
> It might be more beneficial for that person to pursue the work done by
> Tamas and Viktor in the past (see my previous e-mail). This may mean to
> adopt an existing coding style (BSD) and then tweak it.

Thanks Julien for restarting this discussion. IMHO agreeing on a set
of style rules ahead and then applying universally all at once is not
going to be productive since we are so all over the place. Instead, I
would recommend we start piece-by-piece. We introduce a baseline style
checker, then maintainers can decide when and if they want to move
their code-base to be under the automated style checker. That way we
have a baseline and each maintainer can decide on their own terms when
they want to have their files also be style checked and in what form.
The upside of this route I think is pretty clear: we can have at least
partial automation even while we figure out what to do with some of
the more problematic files and quirks that are in our code-base. I
would highly prefer this route since I would immediately bring all
files I maintain over to the automated checker just so I never ever
have to deal with this again manually. What style is in use really
doesn't matter to me (BSD was very close with some minor tweaks), nor
does it matter what tool we use to check the style, just as long as we
have _something_.

Cheers,
Tamas



Re: Xen Coding style

2020-05-08 Thread Tamas K Lengyel
On Fri, May 8, 2020 at 8:18 AM Jürgen Groß  wrote:
>
> On 08.05.20 14:55, Tamas K Lengyel wrote:
> > On Fri, May 8, 2020 at 6:21 AM Julien Grall  wrote:
> >>
> >> Hi Jan,
> >>
> >> On 08/05/2020 12:20, Jan Beulich wrote:
> >>> On 08.05.2020 12:00, Julien Grall wrote:
> >>>> You seem to be the maintainer with the most unwritten rules. Would
> >>>> you mind to have a try at writing a coding style based on it?
> >>>
> >>> On the basis that even small, single aspect patches to CODING_STYLE
> >>> have been ignored [1],
> >>
> >> Your thread is one of the example why I started this thread. Agreeing on
> >> specific rule doesn't work because it either result to bikesheding or
> >> there is not enough interest to review rule by rule.
> >>
> >>> I don't think this would be a good use of my
> >>> time.
> >>
> >> I would have assumed that the current situation (i.e
> >> nitpicking/bikeshedding on the ML) is not a good use of your time :).
> >>
> >> I would be happy to put some effort to help getting the coding style
> >> right, however I believe focusing on an overall coding style would value
> >> everyone's time better than a rule by rule discussion.
> >>
> >>> If I was promised (reasonable) feedback, I could take what I
> >>> have and try to add at least a few more things based on what I find
> >>> myself commenting on more frequently. But really I'd prefer it to
> >>> be done the other way around - for people to look at the patches
> >>> already sent, and for me to only subsequently send more. After all,
> >>> if already those adjustments are controversial, I don't think we
> >>> could settle on others.
> >> While I understand this requires another investment from your part, I am
> >> afraid it is going to be painful for someone else to go through all the
> >> existing coding style bikeshedding and infer your unwritten rules.
> >>
> >> It might be more beneficial for that person to pursue the work done by
> >> Tamas and Viktor in the past (see my previous e-mail). This may mean to
> >> adopt an existing coding style (BSD) and then tweak it.
> >
> > Thanks Julien for restarting this discussion. IMHO agreeing on a set
> > of style rules ahead and then applying universally all at once is not
> > going to be productive since we are so all over the place. Instead, I
> > would recommend we start piece-by-piece. We introduce a baseline style
> > checker, then maintainers can decide when and if they want to move
> > their code-base to be under the automated style checker. That way we
> > have a baseline and each maintainer can decide on their own terms when
> > they want to have their files also be style checked and in what form.
> > The upside of this route I think is pretty clear: we can have at least
> > partial automation even while we figure out what to do with some of
> > the more problematic files and quirks that are in our code-base. I
> > would highly prefer this route since I would immediately bring all
> > files I maintain over to the automated checker just so I never ever
> > have to deal with this again manually. What style is in use really
> > doesn't matter to me (BSD was very close with some minor tweaks), nor
> > does it matter what tool we use to check the style, just as long as we
> > have _something_.
>
> Wouldn't it make more sense to have a patch checker instead and accept
> only patches which change code according to the style guide? This
> wouldn't require to change complete files at a time.

In theory, yes. But in practice this would require that we can agree
on a style that applies to all patches that touch any file within Xen.
We can't seem to do that because there are too many exceptions,
corner cases and personal preferences of maintainers that apply only
to a subset of the codebase. So AFAICT what you propose doesn't seem
to be a viable way to start.

Tamas



[PATCH 2/3] xen/vm_event: add vm_event_check_pending_op

2020-05-15 Thread Tamas K Lengyel
Perform sanity checking when shutting vm_event down to determine whether
it is safe to do so. Error out with -EAGAIN in case pending operations
have been found for the domain.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/vm_event.c| 23 +++
 xen/common/vm_event.c  | 17 ++---
 xen/include/asm-arm/vm_event.h |  7 +++
 xen/include/asm-x86/vm_event.h |  2 ++
 4 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
index 848d69c1b0..558b7da4b1 100644
--- a/xen/arch/x86/vm_event.c
+++ b/xen/arch/x86/vm_event.c
@@ -297,6 +297,29 @@ void vm_event_emulate_check(struct vcpu *v, vm_event_response_t *rsp)
 };
 }
 
+bool vm_event_check_pending_op(struct vcpu *v)
+{
+struct monitor_write_data *w = &v->arch.vm_event->write_data;
+
+if ( !v->arch.vm_event->sync_event )
+return false;
+
+if ( w->do_write.cr0 )
+return true;
+if ( w->do_write.cr3 )
+return true;
+if ( w->do_write.cr4 )
+return true;
+if ( w->do_write.msr )
+return true;
+if ( v->arch.vm_event->set_gprs )
+return true;
+if ( v->arch.vm_event->emulate_flags )
+return true;
+
+return false;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
index 127f2d58f1..2df327a42c 100644
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -183,6 +183,7 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
 if ( vm_event_check_ring(ved) )
 {
 struct vcpu *v;
+bool pending_op = false;
 
 spin_lock(&ved->lock);
 
@@ -192,9 +193,6 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
 return -EBUSY;
 }
 
-/* Free domU's event channel and leave the other one unbound */
-free_xen_event_channel(d, ved->xen_port);
-
 /* Unblock all vCPUs */
 for_each_vcpu ( d, v )
 {
@@ -203,8 +201,21 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
 vcpu_unpause(v);
 ved->blocked--;
 }
+
+if ( vm_event_check_pending_op(v) )
+pending_op = true;
 }
 
+/* vm_event ops are still pending until vCPUs get scheduled */
+if ( pending_op )
+{
+spin_unlock(&ved->lock);
+return -EAGAIN;
+}
+
+/* Free domU's event channel and leave the other one unbound */
+free_xen_event_channel(d, ved->xen_port);
+
 destroy_ring_for_helper(&ved->ring_page, ved->ring_pg_struct);
 
 vm_event_cleanup_domain(d);
diff --git a/xen/include/asm-arm/vm_event.h b/xen/include/asm-arm/vm_event.h
index 14d1d341cc..5cbc9c6dc2 100644
--- a/xen/include/asm-arm/vm_event.h
+++ b/xen/include/asm-arm/vm_event.h
@@ -58,4 +58,11 @@ void vm_event_sync_event(struct vcpu *v, bool value)
 /* Not supported on ARM. */
 }
 
+static inline
+bool vm_event_check_pending_op(struct vcpu *v)
+{
+/* Not supported on ARM. */
+return false;
+}
+
 #endif /* __ASM_ARM_VM_EVENT_H__ */
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 785e741fba..9c5ce3129c 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -54,4 +54,6 @@ void vm_event_emulate_check(struct vcpu *v, vm_event_response_t *rsp);
 
 void vm_event_sync_event(struct vcpu *v, bool value);
 
+bool vm_event_check_pending_op(struct vcpu *v);
+
 #endif /* __ASM_X86_VM_EVENT_H__ */
-- 
2.26.1
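
From the subscriber's side the new -EAGAIN simply means "let the guest run and
try again". A minimal retry loop might look like this (hypothetical sketch; it
assumes the -EAGAIN from vm_event_disable() is surfaced as errno through the
existing xc_monitor_disable() wrapper):

    /* Hypothetical teardown retry; vm_event_disable()'s -EAGAIN is assumed
     * to be propagated as errno by xc_monitor_disable(). */
    #include <errno.h>
    #include <unistd.h>
    #include <xenctrl.h>

    static int monitor_disable_retry(xc_interface *xch, uint32_t domid)
    {
        int rc;

        for ( ;; )
        {
            rc = xc_monitor_disable(xch, domid);
            if ( rc >= 0 || errno != EAGAIN )
                break;
            usleep(1000); /* let the guest vCPUs get scheduled */
        }

        return rc;
    }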




[PATCH 3/3] xen/vm_event: Add safe to disable vm_event

2020-05-15 Thread Tamas K Lengyel
Instead of having to repeatedly try to disable vm_events, request a specific
vm_event to be sent when the domain is safe to continue with shutting down
the vm_event interface.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/hvm/hvm.c| 38 ++-
 xen/arch/x86/hvm/monitor.c| 14 
 xen/arch/x86/monitor.c| 13 +++
 xen/include/asm-x86/domain.h  |  1 +
 xen/include/asm-x86/hvm/monitor.h |  1 +
 xen/include/public/domctl.h   |  2 ++
 xen/include/public/vm_event.h |  8 +++
 7 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 063f8ddc18..50c67e7b8e 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -563,15 +563,41 @@ void hvm_do_resume(struct vcpu *v)
 v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
 }
 
-if ( unlikely(v->arch.vm_event) && v->arch.monitor.next_interrupt_enabled )
+if ( unlikely(v->arch.vm_event) )
 {
-struct x86_event info;
+struct domain *d = v->domain;
+
+if ( v->arch.monitor.next_interrupt_enabled )
+{
+struct x86_event info;
+
+if ( hvm_get_pending_event(v, &info) )
+{
+hvm_monitor_interrupt(info.vector, info.type, info.error_code,
+  info.cr2);
+v->arch.monitor.next_interrupt_enabled = false;
+}
+}
 
-if ( hvm_get_pending_event(v, &info) )
+if ( d->arch.monitor.safe_to_disable )
 {
-hvm_monitor_interrupt(info.vector, info.type, info.error_code,
-  info.cr2);
-v->arch.monitor.next_interrupt_enabled = false;
+struct vcpu *check_vcpu;
+bool pending_op = false;
+
+for_each_vcpu ( d, check_vcpu )
+{
+if ( vm_event_check_pending_op(check_vcpu) )
+{
+pending_op = true;
+break;
+}
+}
+
+if ( !pending_op )
+{
+hvm_monitor_safe_to_disable();
+d->arch.monitor.safe_to_disable = false;
+}
 }
 }
 }
diff --git a/xen/arch/x86/hvm/monitor.c b/xen/arch/x86/hvm/monitor.c
index f5d89e71d1..8e67dd1a0b 100644
--- a/xen/arch/x86/hvm/monitor.c
+++ b/xen/arch/x86/hvm/monitor.c
@@ -300,6 +300,20 @@ bool hvm_monitor_check_p2m(unsigned long gla, gfn_t gfn, uint32_t pfec,
 return monitor_traps(curr, true, &req) >= 0;
 }
 
+bool hvm_monitor_safe_to_disable(void)
+{
+struct vcpu *curr = current;
+struct arch_domain *ad = &curr->domain->arch;
+vm_event_request_t req = {};
+
+if ( !ad->monitor.safe_to_disable )
+return 0;
+
+req.reason = VM_EVENT_REASON_SAFE_TO_DISABLE;
+
+return monitor_traps(curr, 0, &req);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index 1517a97f50..86e0ba2fbc 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -339,6 +339,19 @@ int arch_monitor_domctl_event(struct domain *d,
 break;
 }
 
+case XEN_DOMCTL_MONITOR_EVENT_SAFE_TO_DISABLE:
+{
+bool old_status = ad->monitor.safe_to_disable;
+
+if ( unlikely(old_status == requested_status) )
+return -EEXIST;
+
+domain_pause(d);
+ad->monitor.safe_to_disable = requested_status;
+domain_unpause(d);
+break;
+}
+
 default:
 /*
  * Should not be reached unless arch_monitor_get_capabilities() is
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index d890ab7a22..948b750c71 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -417,6 +417,7 @@ struct arch_domain
  */
 unsigned int inguest_pagefault_disabled: 1;
 unsigned int control_register_values   : 1;
+unsigned int safe_to_disable   : 1;
 struct monitor_msr_bitmap *msr_bitmap;
 uint64_t write_ctrlreg_mask[4];
 } monitor;
diff --git a/xen/include/asm-x86/hvm/monitor.h b/xen/include/asm-x86/hvm/monitor.h
index 66de24cb75..194e2f857e 100644
--- a/xen/include/asm-x86/hvm/monitor.h
+++ b/xen/include/asm-x86/hvm/monitor.h
@@ -52,6 +52,7 @@ bool hvm_monitor_emul_unimplemented(void);
 
 bool hvm_monitor_check_p2m(unsigned long gla, gfn_t gfn, uint32_t pfec,
uint16_t kind);
+bool hvm_monitor_safe_to_disable(void);
 
 #endif /* __ASM_X86_HVM_MONITOR_H__ */
 
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index cbcd25f12c..247e809a6c 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1040,6 +1040,8 @@ struct xen_domctl_psr_cmt_op {
 #define XEN_D
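
Taken together with the previous patch, the intended subscriber flow is:
request this event, keep the guest running, and only tear vm_event down once
the notification arrives. A hypothetical handler fragment (the reason code is
the one added above; xc_monitor_disable() is the existing libxc wrapper):

    /* Hypothetical handler: once VM_EVENT_REASON_SAFE_TO_DISABLE arrives,
     * no register writes are pending on any vCPU and teardown can proceed. */
    #include <xenctrl.h>
    #include <xen/vm_event.h>

    static bool try_teardown(xc_interface *xch, uint32_t domid,
                             const vm_event_request_t *req)
    {
        if ( req->reason != VM_EVENT_REASON_SAFE_TO_DISABLE )
            return false;

        return xc_monitor_disable(xch, domid) == 0;
    }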

[PATCH 1/3] xen/monitor: Control register values

2020-05-15 Thread Tamas K Lengyel
Extend the monitor_op domctl to include an option that enables
a monitor subscriber to control what values certain registers
are permitted to hold.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/hvm/hvm.c   | 31 +++
 xen/arch/x86/monitor.c   | 10 +-
 xen/include/asm-x86/domain.h |  1 +
 xen/include/public/domctl.h  |  1 +
 4 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 814b7020d8..063f8ddc18 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2263,9 +2263,10 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
 {
 ASSERT(v->arch.vm_event);
 
-if ( hvm_monitor_crX(CR0, value, old_value) )
+if ( hvm_monitor_crX(CR0, value, old_value) &&
+ v->domain->arch.monitor.control_register_values )
 {
-/* The actual write will occur in hvm_do_resume(), if permitted. */
+/* The actual write will occur in hvm_do_resume, if permitted. */
 v->arch.vm_event->write_data.do_write.cr0 = 1;
 v->arch.vm_event->write_data.cr0 = value;
 
@@ -2362,9 +2363,10 @@ int hvm_set_cr3(unsigned long value, bool may_defer)
 {
 ASSERT(v->arch.vm_event);
 
-if ( hvm_monitor_crX(CR3, value, old) )
+if ( hvm_monitor_crX(CR3, value, old) &&
+ v->domain->arch.monitor.control_register_values )
 {
-/* The actual write will occur in hvm_do_resume(), if permitted. */
+/* The actual write will occur in hvm_do_resume, if permitted. */
 v->arch.vm_event->write_data.do_write.cr3 = 1;
 v->arch.vm_event->write_data.cr3 = value;
 
@@ -2443,9 +2445,10 @@ int hvm_set_cr4(unsigned long value, bool may_defer)
 {
 ASSERT(v->arch.vm_event);
 
-if ( hvm_monitor_crX(CR4, value, old_cr) )
+if ( hvm_monitor_crX(CR4, value, old_cr) &&
+ v->domain->arch.monitor.control_register_values )
 {
-/* The actual write will occur in hvm_do_resume(), if permitted. */
+/* The actual write will occur in hvm_do_resume, if permitted. */
 v->arch.vm_event->write_data.do_write.cr4 = 1;
 v->arch.vm_event->write_data.cr4 = value;
 
@@ -3587,13 +3590,17 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
 
 ASSERT(v->arch.vm_event);
 
-/* The actual write will occur in hvm_do_resume() (if permitted). */
-v->arch.vm_event->write_data.do_write.msr = 1;
-v->arch.vm_event->write_data.msr = msr;
-v->arch.vm_event->write_data.value = msr_content;
-
 hvm_monitor_msr(msr, msr_content, msr_old_content);
-return X86EMUL_OKAY;
+
+if ( v->domain->arch.monitor.control_register_values )
+{
+/* The actual write will occur in hvm_do_resume, if permitted. */
+v->arch.vm_event->write_data.do_write.msr = 1;
+v->arch.vm_event->write_data.msr = msr;
+v->arch.vm_event->write_data.value = msr_content;
+
+return X86EMUL_OKAY;
+}
 }
 
 if ( (ret = guest_wrmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE )
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index bbcb7536c7..1517a97f50 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -144,7 +144,15 @@ int arch_monitor_domctl_event(struct domain *d,
   struct xen_domctl_monitor_op *mop)
 {
 struct arch_domain *ad = &d->arch;
-bool requested_status = (XEN_DOMCTL_MONITOR_OP_ENABLE == mop->op);
+bool requested_status;
+
+if ( XEN_DOMCTL_MONITOR_OP_CONTROL_REGISTERS == mop->op )
+{
+ad->monitor.control_register_values = true;
+return 0;
+}
+
+requested_status = (XEN_DOMCTL_MONITOR_OP_ENABLE == mop->op);
 
 switch ( mop->event )
 {
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 5b6d909266..d890ab7a22 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -416,6 +416,7 @@ struct arch_domain
  * This is used to filter out pagefaults.
  */
 unsigned int inguest_pagefault_disabled: 1;
+unsigned int control_register_values   : 1;
 struct monitor_msr_bitmap *msr_bitmap;
 uint64_t write_ctrlreg_mask[4];
 } monitor;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 1ad34c35eb..cbcd25f12c 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1025,6 +1025,7 @@ struct xen_domctl_psr_cmt_op {
 #define XEN_DOMCTL_MONITOR_OP_DISABLE   1
 #define XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES  2
 #define XEN_DOMCTL_
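
A libxc wrapper for the new op is not part of this patch; one following the
pattern of the existing xc_monitor_*() helpers could look roughly like this
(hypothetical sketch; DECLARE_DOMCTL and do_domctl() are libxc-internal
helpers from xc_private.h):

    /* Hypothetical wrapper enabling the opt-in, modelled on the helpers in
     * tools/libxc/xc_monitor.c. */
    #include "xc_private.h"

    int xc_monitor_control_register_values(xc_interface *xch, uint32_t domain_id)
    {
        DECLARE_DOMCTL;

        domctl.cmd = XEN_DOMCTL_monitor_op;
        domctl.domain = domain_id;
        domctl.u.monitor_op.op = XEN_DOMCTL_MONITOR_OP_CONTROL_REGISTERS;

        return do_domctl(xch, &domctl);
    }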

[PATCH 0/3] vm_event: fix race-condition when disabling monitor events

2020-05-15 Thread Tamas K Lengyel
For the last couple years we have received numerous reports from users of
monitor vm_events of spurious guest crashes when using events. In particular,
it has been observed that the problem occurs when vm_events are being disabled.
The nature of the guest crash varied widely and has only occurred occasionally. This
made debugging the issue particularly hard. We had discussions about this issue
even here on the xen-devel mailing list with no luck figuring it out.

The bug has now been identified as a race-condition between register event
handling and disabling the vm_event interface.

Patch 96760e2fba100d694300a81baddb5740e0f8c0ee, "vm_event: deny register writes
if refused by vm_event reply" is the patch that introduced the error. In this
patch emulation of register write events can be postponed until the
corresponding vm_event handler decides whether to allow such write to take
place. Unfortunately this can only be implemented by performing the deny/allow
step when the vCPU gets scheduled. Due to that postponed emulation of the event,
if the user decides to pause the VM in the vm_event handler and then disables
events, the entire emulation step is skipped the next time the vCPU is resumed.
Even if the user doesn't pause during the vm_event handling but exits
immediately and disables vm_event, the situation becomes racy as disabling
vm_event may succeed before the guest's vCPUs get scheduled with the pending
emulation task. This has been particularly the case with VMs that have several
vCPUs as after the VM is unpaused it may actually take a long time before all
vCPUs get scheduled.

The only solution currently is to poll each vCPU before vm_events are disabled
to verify they had been scheduled. The following patches resolve this issue in
a much nicer way.

Patch 1 adds an option to the monitor_op domctl that needs to be specified if
the user wants to actually use the postponed register-write handling
mechanism. If that option is not specified then handling is performed the
same way as before patch 96760e2fba100d694300a81baddb5740e0f8c0ee.

Patch 2 performs sanity checking when disabling vm_events to determine whether
it's safe to free all vm_event structures. The vCPUs still get unpaused to
allow them to get scheduled and perform any of their pending operations,
but otherwise an -EAGAIN error is returned signaling to the user that they
need to wait and try again disabling the interface.

Patch 3 adds a vm_event specifically to signal to the user when it is safe to
continue disabling the interface.

Shout out to our friends at CERT.pl for stumbling upon a crucial piece of
information that led to finally squashing this nasty bug.

Tamas K Lengyel (3):
  xen/monitor: Control register values
  xen/vm_event: add vm_event_check_pending_op
  xen/vm_event: Add safe to disable vm_event

 xen/arch/x86/hvm/hvm.c| 69 +++
 xen/arch/x86/hvm/monitor.c| 14 +++
 xen/arch/x86/monitor.c| 23 ++-
 xen/arch/x86/vm_event.c   | 23 +++
 xen/common/vm_event.c | 17 ++--
 xen/include/asm-arm/vm_event.h|  7 
 xen/include/asm-x86/domain.h  |  2 +
 xen/include/asm-x86/hvm/monitor.h |  1 +
 xen/include/asm-x86/vm_event.h|  2 +
 xen/include/public/domctl.h   |  3 ++
 xen/include/public/vm_event.h |  8 
 11 files changed, 147 insertions(+), 22 deletions(-)

-- 
2.26.1




Re: [PATCH] x86/hvm: Fix memory leaks in hvm_copy_context_and_params()

2020-05-18 Thread Tamas K Lengyel
On Sat, May 16, 2020 at 6:22 AM Andrew Cooper  wrote:
>
> Any error from hvm_save() or hvm_set_param() leaks the c.data allocation.
>
> Spotted by Coverity.
>
> Fixes: 353744830 "x86/hvm: introduce hvm_copy_context_and_params"
> Signed-off-by: Andrew Cooper 

Thanks for the fix, my bad!

Tamas



Re: [PATCH for-4.14 0/3] Remove the 1GB limitation on Rasberry Pi 4

2020-05-18 Thread Tamas K Lengyel
On Mon, May 18, 2020 at 5:32 AM Julien Grall  wrote:
>
> From: Julien Grall 
>
> Hi all,
>
> At the moment, a user who wants to boot Xen on the Raspberry Pi 4 can
> only use the first GB of memory.
>
> This is because several devices cannot DMA above 1GB but Xen doesn't
> necessarily allocate memory for Dom0 below 1GB.
>
> This small series is trying to address the problem by allowing a
> platform to restrict where Dom0 banks are allocated.
>
> This is also a candidate for Xen 4.14. Without it, a user will not be
> able to use all the RAM on the Raspberry Pi 4.
>
> This series has only been slightly tested. I would appreciate more testing on
> the Raspberry Pi 4 to confirm this removes the restriction.

Hi Julien,
could you post a git branch somewhere? I can try this on my rpi4 that
already runs 4.13.

Thanks,
Tamas



Re: [PATCH v18 2/2] tools/libxl: VM forking toolstack side

2020-05-14 Thread Tamas K Lengyel
On Thu, May 14, 2020 at 8:52 AM Ian Jackson  wrote:
>
> Tamas K Lengyel writes ("[PATCH v18 2/2] tools/libxl: VM forking toolstack 
> side"):
> > Add necessary bits to implement "xl fork-vm" commands. The command allows 
> > the
> > user to specify how to launch the device model allowing for a late-launch 
> > model
> > in which the user can execute the fork without the device model and decide 
> > to
> > only later launch it.
>
> Hi.
>
> Sorry to be so late in reviewing this.  I will divert my main
> attention to the API elements...
>
> > +=item B<fork-vm> [I<OPTIONS>] I<domain-id>
> > +
> > +Create a fork of a running VM.  The domain will be paused after the 
> > operation
> > +and remains paused while forks of it exist.
>
> Do you mean "must remain paused" ?  And "The original domain" rather
> than "The domain" ?

Yes, I mean the original domain.

>
> > +B<OPTIONS>
> > +
> > +=over 4
> > +
> > +=item B<-p>
> > +
> > +Leave the fork paused after creating it.
>
> By default the fork runs right away, then, I take it.

The same route is taken as when you run "xl restore", so yes. This applies
if you are launching the device model. Without the device model launch
we are not going down the same path as "xl restore" so the fork is
paused in that case.

>
> > +=item B<--launch-dm>
> > +
> > +Specify whether the device model (QEMU) should be launched for the fork. 
> > Late
> > +launch allows to start the device model for an already running fork.
>
> It's not clear to me whether this launches the DM for an existing
> fork, or specify when forking that the DM should be run ?

It's possible to do both. You can create a fork and launch the device
model for it right away, or you can create a fork, unpause it, and
only launch the device model when it's actually necessary.

>
> Do you really mean that you can run a fork for a while with no DM ?
> How does that work ?
>
> Also you seem to have not documented the launch-dm operation ?

It's possible I missed it.

>
> > +=item B<-C>
> > +
> > +The config file to use when launching the device model.  Currently 
> > required when
> > +launching the device model.  Most config settings MUST match the parent 
> > domain
> > +exactly, only change VM name, disk path and network configurations.
>
> This is a libxl config file, right ?

Yes.

>
> > +=item B<-Q>
> > +
> > +The path to the qemu save file to use when launching the device model.  
> > Currently
> > +required when launching the device model.
>
> Where would the user get one of these ?

Generate it by connecting to the qmp socket of the parent domain and
issuing the command that saves it. See the cover letter to the series. I
stopped sending the cover letter since there is only this one
outstanding patch now.

>
> I think this question has no good answer and this reveals a problem
> with the API...

I don't know what "problem" you are referring to. We deliberately
chose not to include saving the qemu save file every time a fork is
made because for our usecase you only need to generate the qemu save
file once. Doing it for every fork is a huge waste of time since we are
spinning off forks from the same state hundreds of thousands of times.
No need to regenerate the same save file for each.

>
> > +=item B<--fork-reset>
> > +
> > +Perform a reset operation of an already running fork.  Note that resetting 
> > may
> > +be less performant then creating a new fork depending on how much memory 
> > the
> > +fork has deduplicated during its runtime.
>
> What is the semantic effect of a reset ?

I don't understand the question.

>
> > +=item B<--max-vcpus>
> > +
> > +Specify the max-vcpus matching the parent domain when not launching the dm.
>
> What ?  This makes little sense to me.  You specify vm-fork
> --max-vcpus and it changes the parent's max-vcpus ??

No. You need the max-vcpus value when you create a fork. The domain
create hypercall needs it to set the domain up. I originally wanted to
extend the domain create hypercall so this could be copied by the
hypervisor but the hypervisor maintainers were against changing that
hypercall. So we are left with having to pass it manually.

>
> > +=item B<--allow-iommu>
> > +
> > +Specify to allow forking a domain that has IOMMU enabled. Only compatible 
> > with
> > +forks using --launch-dm no.
>
> Are there no some complex implications here ?  Maybe this doc needs a
> caveat.

The only caveat is that this option is only available for forks that have
no device models launched for th

Re: [Xen-devel] [PATCH v11 2/3] x86/mem_sharing: reset a fork

2020-03-18 Thread Tamas K Lengyel
On Wed, Mar 18, 2020 at 5:36 AM Jan Beulich  wrote:
>
> On 28.02.2020 19:40, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1775,6 +1775,91 @@ static int fork(struct domain *cd, struct domain *d)
> >   return rc;
> >   }
> >
> > +/*
> > + * The fork reset operation is intended to be used on short-lived forks only.
> > + */
> > +static int fork_reset(struct domain *d, struct domain *cd,
>
> Could I talk you into using pd instead of d, to even more
> clearly distinguish which of the two domain's is meant? Also
> in principle this might be possible to be a pointer to const,
> albeit I realize this may need changes you likely don't want
> to do in a prereq patch (and maybe there's actually a reason
> why it can't be).

The names c and cd are used across the mem_sharing codebase; for
consistency I'm keeping that.

>
> > +  struct mem_sharing_op_fork_reset *fr)
> > +{
> > +int rc = 0;
> > +struct p2m_domain* p2m = p2m_get_hostp2m(cd);
>
> Star and blank want to switch places here.
>
> > +struct page_info *page, *tmp;
> > +unsigned long list_position = 0, preempt_count = 0, restart = fr->opaque;
> > +
> > +domain_pause(cd);
> > +
> > +page_list_for_each_safe(page, tmp, &cd->page_list)
>
> You may not iterate a domain's page list without holding its
> page-alloc lock. Even if the domain is paused, other entities
> (like the controlling domain) may cause the list to be altered.
> With this the question then of course becomes whether holding
> that lock for this long is acceptable. I guess you need to
> somehow mark the pages you've processed, either by a flag or
> by moving between separate lists. Domain cleanup does something
> along these lines.
>
> > +{
> > +p2m_type_t p2mt;
> > +p2m_access_t p2ma;
> > +gfn_t gfn;
> > +mfn_t mfn;
> > +bool shared = false;
> > +
> > +list_position++;
> > +
> > +/* Resume where we left off before preemption */
> > +if ( restart && list_position < restart )
> > +continue;
>
> This assumes the list to not have been changed across a continuation,
> which isn't going to fly.


OK, I'm going to drop continuation here completely. I was reluctant to
add it to begin with since this hypercall should only be called when
the number of pages is low so there wouldn't be continuation anyway.
This is work I'm unable to assign more time for; if someone in the
future really needs continuation they are welcome to figure it out.

>
> > +mfn = page_to_mfn(page);
> > +if ( mfn_valid(mfn) )
>
> All pages on a domain's list should have a valid MFN - what are you
> trying to protect against here?

I saw no documentation stating what you stated above. If that's the
case it can be dropped.

>
> > +{
> > +
> > +gfn = mfn_to_gfn(cd, mfn);
>
> Stray blank line above here?
>
> > +mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> > +0, NULL, false);
> > +
> > +if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
> > +{
> > +/* take an extra reference, must work for a shared page */
>
> The comment (and also the next one further down) looks contradictory
> to the if() immediately ahead, at least to me. Could you clarify the
> situation, please?

I don't understand your question.  The comment explains exactly what
happens. Taking an extra reference must work. If it didn't, trigger an
ASSERT_UNREACHABLE. Which part is confusing?

>
> > +if( !get_page(page, cd) )
> > +{
> > +ASSERT_UNREACHABLE();
> > +return -EINVAL;
> > +}
> > +
> > +shared = true;
> > +preempt_count += 0x10;
> > +
> > +/*
> > + * Must succeed, it's a shared page that exists and
> > + * thus its size is guaranteed to be 4k so we are not splitting
> > + * large pages.
> > + */
> > +rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
> > +p2m_invalid, p2m_access_rwx, -1);
> > +ASSERT(!rc);
> > +
> > +put_page_alloc_ref(page);
> > +put_page(page);
> > +}
> > +}
> &

Re: [Xen-devel] [PATCH v11 2/3] x86/mem_sharing: reset a fork

2020-03-18 Thread Tamas K Lengyel
On Wed, Mar 18, 2020 at 8:13 AM Jan Beulich  wrote:
>
> On 18.03.2020 15:00, Tamas K Lengyel wrote:
> > On Wed, Mar 18, 2020 at 5:36 AM Jan Beulich  wrote:
> >> On 28.02.2020 19:40, Tamas K Lengyel wrote:
> >>> +mfn = page_to_mfn(page);
> >>> +if ( mfn_valid(mfn) )
> >>
> >> All pages on a domain's list should have a valid MFN - what are you
> >> trying to protect against here?
> >
> > I saw no documentation stating what you stated above. If that's the
> > case it can be dropped.
>
> Only pages coming from the allocator (or, in some special cases,
> otherwise valid) get put on a domain's page list. By coming from
> the allocator their MFNs are impicitly valid.
>
> >>> +mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> >>> +0, NULL, false);
> >>> +
> >>> +if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
> >>> +{
> >>> +/* take an extra reference, must work for a shared page */
> >>
> >> The comment (and also the next one further down) looks contradictory
> >> to the if() immediately ahead, at least to me. Could you clarify the
> >> situation, please?
> >
> > I don't understand your question.  The comment explains exactly what
> > happens. Taking an extra reference must work. If it didn't, trigger an
> > ASSERT_UNREACHABLE. Which part is confusing?
>
> The comment says "a shared page" whereas the condition includes
> "!p2m_is_shared(p2mt)", which I understand to mean a page which is
> not shared.
>
> As to you dropping continuations again - please have at least a
> bold comment clarifying that their addition is a requirement for
> the code to ever reach "supported" status. (Any other obvious but
> intentional omissions could also be named there.)
>

Sure, I had that comment in place before. There are no plans to have
this code be "supported", we are fine with it being experimental.

Tamas


[Xen-devel] [PATCH] x86/mem_sharing: move mem_sharing_domain declaration

2020-03-18 Thread Tamas K Lengyel
Due to recent reshuffling of header include paths mem_sharing no longer
compiles. Fix it by moving the mem_sharing_domain declaration to the
location it is used in.

Signed-off-by: Tamas K Lengyel 
---
 xen/include/asm-x86/hvm/domain.h  | 13 +
 xen/include/asm-x86/mem_sharing.h | 11 ---
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 624a67d0dd..95fe18cddc 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -64,6 +64,19 @@ struct hvm_ioreq_server {
 uint8_tbufioreq_handling;
 };
 
+#ifdef CONFIG_MEM_SHARING
+struct mem_sharing_domain
+{
+bool enabled;
+
+/*
+ * When releasing shared gfn's in a preemptible manner, recall where
+ * to resume the search.
+ */
+unsigned long next_shared_gfn_to_relinquish;
+};
+#endif
+
 /*
  * This structure defines function hooks to support hardware-assisted
  * virtual interrupt delivery to guest. (e.g. VMX PI and SVM AVIC).
diff --git a/xen/include/asm-x86/mem_sharing.h b/xen/include/asm-x86/mem_sharing.h
index 53760a2896..53b7929d0e 100644
--- a/xen/include/asm-x86/mem_sharing.h
+++ b/xen/include/asm-x86/mem_sharing.h
@@ -26,17 +26,6 @@
 
 #ifdef CONFIG_MEM_SHARING
 
-struct mem_sharing_domain
-{
-bool enabled;
-
-/*
- * When releasing shared gfn's in a preemptible manner, recall where
- * to resume the search.
- */
-unsigned long next_shared_gfn_to_relinquish;
-};
-
 #define mem_sharing_enabled(d) ((d)->arch.hvm.mem_sharing.enabled)
 
 /* Auditing of memory sharing code? */
-- 
2.20.1



[Xen-devel] [PATCH] travis: add mem_sharing compile test

2020-03-18 Thread Tamas K Lengyel
Add a compile test for mem_sharing to avoid future breakage going unnoticed.

Signed-off-by: Tamas K Lengyel 
---
 .travis.yml  | 3 +++
 scripts/travis-build | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 15ca9e9047..908d205d27 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,6 +36,9 @@ matrix:
   env: XEN_TARGET_ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- 
XEN_CONFIG_EXPERT=y RANDCONFIG=y debug=n
 - compiler: gcc
   env: XEN_TARGET_ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- debug=y
+- compiler: gcc
+  env: XEN_TARGET_ARCH=x86_64 XEN_CONFIG_EXPERT=y MEM_SHARING=y debug=y
+
 addons:
 apt:
 sources:
diff --git a/scripts/travis-build b/scripts/travis-build
index 0cb15a89e4..b92437e92e 100755
--- a/scripts/travis-build
+++ b/scripts/travis-build
@@ -5,6 +5,9 @@ $CC --version
 # random config or default config
 if [[ "${RANDCONFIG}" == "y" ]]; then
 make -C xen KCONFIG_ALLCONFIG=tools/kconfig/allrandom.config randconfig
+elif [[ "${MEM_SHARING}" == "y" ]]; then
+echo "CONFIG_MEM_SHARING=y" > xen/.config
+make -C xen olddefconfig
 else
 make -C xen defconfig
 fi
-- 
2.20.1



Re: [Xen-devel] [PATCH v11 1/3] xen/mem_sharing: VM forking

2020-03-17 Thread Tamas K Lengyel
On Tue, Mar 17, 2020 at 10:02 AM Jan Beulich  wrote:
>
> On 28.02.2020 19:40, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/mm/p2m.c
> > +++ b/xen/arch/x86/mm/p2m.c
> > @@ -509,6 +509,12 @@ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, 
> > unsigned long gfn_l,
> >
> >  mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> >
> > +/* Check if we need to fork the page */
> > +if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > + !mem_sharing_fork_page(p2m->domain, gfn, !!(q & P2M_UNSHARE)) )
>
> No need for !! here.

I don't think it really matters but sure.

>
> > @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> >  return page;
> >
> >  /* Error path: not a suitable GFN at all */
> > -if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> > +if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> > + !mem_sharing_is_fork(p2m->domain) )
> >  return NULL;
>
> This looks pretty broad a condition, i.e. all possible types would
> make it through here for a fork. Wouldn't it make sense to limit
> to to p2m_is_hole() page types, like you check for in
> __get_gfn_type_access()?

No need to put that check here. By allowing it to go further when we have
a forked VM, this code-path will call get_gfn_type_access, which does
that check. It's better to have that check in one place instead of all
over unnecessarily.

>
> > --- a/xen/common/domain.c
> > +++ b/xen/common/domain.c
> > @@ -1269,6 +1269,9 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, 
> > unsigned offset)
> >
> >  v->vcpu_info = new_info;
> >  v->vcpu_info_mfn = page_to_mfn(page);
> > +#ifdef CONFIG_MEM_SHARING
> > +v->vcpu_info_offset = offset;
> > +#endif
>
> Seeing something like this makes me wonder whether forking shouldn't
> have its own Kconfig control.

For now I think it's fine to have it under mem_sharing.

>
> > --- a/xen/include/asm-x86/mem_sharing.h
> > +++ b/xen/include/asm-x86/mem_sharing.h
> > @@ -39,6 +39,9 @@ struct mem_sharing_domain
> >
> >  #define mem_sharing_enabled(d) ((d)->arch.hvm.mem_sharing.enabled)
> >
> > +#define mem_sharing_is_fork(d) \
> > +(mem_sharing_enabled(d) && !!((d)->parent))
>
> Again not need for !! (for a different reason).

Which is?

>
> Also, does the build break if you made this an inline function
> (as we generally prefer)?

Any particular reason for that (inline vs define)?

>
> > @@ -141,6 +148,16 @@ static inline int mem_sharing_notify_enomem(struct 
> > domain *d, unsigned long gfn,
> >  return -EOPNOTSUPP;
> >  }
> >
> > +static inline int mem_sharing_fork(struct domain *d, struct domain *cd, 
> > bool vcpu)
> > +{
> > +return -EOPNOTSUPP;
> > +}

This actually is no longer needed at all.

> > +
> > +static inline int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool 
> > lock)
> > +{
> > +return -EOPNOTSUPP;
> > +}
>
> Can these be reached? If not, please add ASSERT_UNREACHABLE().

This can be reached.

>
> > @@ -532,6 +533,10 @@ struct xen_mem_sharing_op {
> >  uint32_t gref; /* IN: gref to debug */
> >  } u;
> >  } debug;
> > +struct mem_sharing_op_fork {  /* OP_FORK */
> > +domid_t parent_domain;/* IN: parent's domain id */
> > +uint16_t _pad[3]; /* Must be set to 0 */
>
> Especially in the public interface - no new name space
> violations please. I.e. please drop the leading underscore.
> I also struggle to see why this is an array of three
> elements. In fact I don't see why the padding field would be
> needed at all - one other union member only gets padded to
> its alignment (which is what I'd expect), while others
> (presumably older ones) don't have any padding at all. Here
> there's no implicit structure's alignment padding that wants
> making explicit.

I don't know what you are asking. Drop the padding? I prefer each
union member to be padded to 64-bit; it reduces the cognitive load of
trying to figure out what the size and alignment of each member struct
would be.

>
> > --- a/xen/include/xen/sched.h
> > +++ b/xen/include/xen/sched.h
> > @@ -248,6 +248,9 @@ struct vcpu
> >
> >  /* Guest-specified relocation of vcpu_info. */
> >  mfn_tvcpu_info_mfn;
> > +#ifdef CONFIG_MEM_SHARING
> > +uint32_t vcpu_info_offset;
>
> There's no need for a fixed width type here afaics - unsigned
> int and probably even unsigned short would seem to both be
> fine.

OK.

Thanks,
Tamas


Re: [Xen-devel] [PATCH v11 1/3] xen/mem_sharing: VM forking

2020-03-17 Thread Tamas K Lengyel
On Tue, Mar 17, 2020 at 10:35 AM Jan Beulich  wrote:
>
> On 17.03.2020 17:23, Tamas K Lengyel wrote:
> > On Tue, Mar 17, 2020 at 10:02 AM Jan Beulich  wrote:
> >> On 28.02.2020 19:40, Tamas K Lengyel wrote:
> >>> @@ -588,7 +594,8 @@ struct page_info *p2m_get_page_from_gfn(
> >>>  return page;
> >>>
> >>>  /* Error path: not a suitable GFN at all */
> >>> -if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) )
> >>> +if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) &&
> >>> + !mem_sharing_is_fork(p2m->domain) )
> >>>  return NULL;
> >>
> >> This looks pretty broad a condition, i.e. all possible types would
> >> make it through here for a fork. Wouldn't it make sense to limit
> >> to to p2m_is_hole() page types, like you check for in
> >> __get_gfn_type_access()?
> >
> > No need to put that check here. By allowing it to go further when we have
> > a forked VM, this code-path will call get_gfn_type_access, which does
> > that check. It's better to have that check in one place instead of all
> > over unnecessarily.
>
> Well, if worse performance (due to more cases where the lock will
> be taken) is not of concern - so be it.
>
> >>> --- a/xen/include/asm-x86/mem_sharing.h
> >>> +++ b/xen/include/asm-x86/mem_sharing.h
> >>> @@ -39,6 +39,9 @@ struct mem_sharing_domain
> >>>
> >>>  #define mem_sharing_enabled(d) ((d)->arch.hvm.mem_sharing.enabled)
> >>>
> >>> +#define mem_sharing_is_fork(d) \
> >>> +(mem_sharing_enabled(d) && !!((d)->parent))
> >>
> >> Again not need for !! (for a different reason).
> >
> > Which is?
>
> Further up the reason was that you pass the value as argument
> for a boolean function parameter. Here the reason is that is an
> operand of &&.
>
> >> Also, does the build break if you made this an inline function
> >> (as we generally prefer)?
> >
> > Any particular reason for that (inline vs define)?
>
> Inline functions add type safety for the arguments, which
> #define-s don't do.

Ack.

>
> >>> @@ -532,6 +533,10 @@ struct xen_mem_sharing_op {
> >>>  uint32_t gref; /* IN: gref to debug */
> >>>  } u;
> >>>  } debug;
> >>> +struct mem_sharing_op_fork {  /* OP_FORK */
> >>> +domid_t parent_domain;/* IN: parent's domain id */
> >>> +uint16_t _pad[3]; /* Must be set to 0 */
> >>
> >> Especially in the public interface - no new name space
> >> violations please. I.e. please drop the leading underscore.
> >> I also struggle to see why this is an array of three
> >> elements. In fact I don't see why the padding field would be
> >> needed at all - one other union member only gets padded to
> >> its alignment (which is what I'd expect), while others
> >> (presumably older ones) don't have any padding at all. Here
> >> there's no implicit structure's alignment padding that wants
> >> making explicit.
> >
> > I don't know what you are asking. Drop the padding? I prefer each
> > union member to be padded to 64-bit; it reduces the cognitive load of
> > trying to figure out what the size and alignment of each member struct
> > would be.
>
> Personally I'd suggest to drop the padding, as it actually
> grows the size of the structure. But if you feel strongly
> about keeping it, then I'll be okay with just the field's
> name changed.

It grows the structure size to 64-bit, yes, but it doesn't grow the
size of the union as other members are much larger. I'll remove the
underscore from the pad name but I still prefer it aligned.

Tamas

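
For reference, the inline-vs-#define point above in concrete terms: v12 of the
series converts the helper to an inline function along these lines (sketch
only), which the compiler can type-check, whereas the earlier #define would
silently accept any expression that happened to have the right members:

    /* Sketch of the inline conversion; unlike the #define, anything that is
     * not a struct domain * is now rejected at compile time. */
    static inline bool mem_sharing_is_fork(const struct domain *d)
    {
        return mem_sharing_enabled(d) && d->parent;
    }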

[Xen-devel] [PATCH v12 1/3] xen/mem_sharing: VM forking

2020-03-23 Thread Tamas K Lengyel
VM forking is the process of creating a domain with an empty memory space and a
parent domain specified from which to populate the memory when necessary. For
the new domain to be functional the VM state is copied over as part of the fork
operation (HVM params, hap allocation, etc).

Signed-off-by: Tamas K Lengyel 
---
v12: Minor style adjustments Jan pointed out
 Convert mem_sharing_is_fork to inline function
v11: Fully copy vcpu_info pages
 Setup vcpu_runstate for forks
 Added TODO note for PV timers
 Copy shared_info page
 Add copy_settings function, to be shared with fork_reset in the next patch
---
 xen/arch/x86/domain.c |  11 +
 xen/arch/x86/hvm/hvm.c|   4 +-
 xen/arch/x86/mm/hap/hap.c |   3 +-
 xen/arch/x86/mm/mem_sharing.c | 368 ++
 xen/arch/x86/mm/p2m.c |   9 +-
 xen/common/domain.c   |   3 +
 xen/include/asm-x86/hap.h |   1 +
 xen/include/asm-x86/hvm/hvm.h |   2 +
 xen/include/asm-x86/mem_sharing.h |  18 ++
 xen/include/public/memory.h   |   5 +
 xen/include/xen/sched.h   |   5 +
 11 files changed, 424 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index caf2ecad7e..11d3c2216e 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2202,6 +2202,17 @@ int domain_relinquish_resources(struct domain *d)
 ret = relinquish_shared_pages(d);
 if ( ret )
 return ret;
+
+/*
+ * If the domain is forked, decrement the parent's pause count
+ * and release the domain.
+ */
+if ( mem_sharing_is_fork(d) )
+{
+domain_unpause(d->parent);
+put_domain(d->parent);
+d->parent = NULL;
+}
 }
 #endif
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index a3d115b650..304b3d1562 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1917,7 +1917,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
 }
 #endif
 
-/* Spurious fault? PoD and log-dirty also take this path. */
+/* Spurious fault? PoD, log-dirty and VM forking also take this path. */
 if ( p2m_is_ram(p2mt) )
 {
 rc = 1;
@@ -4377,7 +4377,7 @@ static int hvm_allow_get_param(struct domain *d,
 return rc;
 }
 
-static int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
+int hvm_get_param(struct domain *d, uint32_t index, uint64_t *value)
 {
 int rc;
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index a6d5e39b02..814d0c3253 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -321,8 +321,7 @@ static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
 }
 
 /* Return the size of the pool, rounded up to the nearest MB */
-static unsigned int
-hap_get_allocation(struct domain *d)
+unsigned int hap_get_allocation(struct domain *d)
 {
 unsigned int pg = d->arch.paging.hap.total_pages
 + d->arch.paging.hap.p2m_pages;
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 3835bc928f..23deeddff2 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -22,6 +22,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -36,6 +37,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "mm-locks.h"
@@ -1444,6 +1447,334 @@ static inline int mem_sharing_control(struct domain *d, bool enable)
 return 0;
 }
 
+/*
+ * Forking a page only gets called when the VM faults due to no entry being
+ * in the EPT for the access. Depending on the type of access we either
+ * populate the physmap with a shared entry for read-only access or
+ * fork the page if its a write access.
+ *
+ * The client p2m is already locked so we only need to lock
+ * the parent's here.
+ */
+int mem_sharing_fork_page(struct domain *d, gfn_t gfn, bool unsharing)
+{
+int rc = -ENOENT;
+shr_handle_t handle;
+struct domain *parent = d->parent;
+struct p2m_domain *p2m;
+unsigned long gfn_l = gfn_x(gfn);
+mfn_t mfn, new_mfn;
+p2m_type_t p2mt;
+struct page_info *page;
+
+if ( !mem_sharing_is_fork(d) )
+return -ENOENT;
+
+if ( !unsharing )
+{
+/* For read-only accesses we just add a shared entry to the physmap */
+while ( parent )
+{
+if ( !(rc = nominate_page(parent, gfn, 0, &handle)) )
+break;
+
+parent = parent->parent;
+}
+
+if ( !rc )
+{
+/* The client's p2m is already locked */
+struct p2m_domain *pp2m = p2m_get_hostp2m(parent);
+
+p2m_lock(pp2m);
+rc = add_to_physmap(parent, gfn_l, handle, d, gfn_l, false);
+p2m_unlock(pp2m);
+
+if ( 

[Xen-devel] [PATCH v12 3/3] xen/tools: VM forking toolstack side

2020-03-23 Thread Tamas K Lengyel
Add necessary bits to implement "xl fork-vm" commands. The command allows the
user to specify how to launch the device model allowing for a late-launch model
in which the user can execute the fork without the device model and decide to
only later launch it.

Signed-off-by: Tamas K Lengyel 
---
 docs/man/xl.1.pod.in  |  44 +
 tools/libxc/include/xenctrl.h |  13 ++
 tools/libxc/xc_memshr.c   |  22 +++
 tools/libxl/libxl.h   |  11 ++
 tools/libxl/libxl_create.c| 361 +++---
 tools/libxl/libxl_dm.c|   2 +-
 tools/libxl/libxl_dom.c   |  43 +++-
 tools/libxl/libxl_internal.h  |   7 +
 tools/libxl/libxl_types.idl   |   1 +
 tools/libxl/libxl_x86.c   |  41 
 tools/xl/Makefile |   2 +-
 tools/xl/xl.h |   5 +
 tools/xl/xl_cmdtable.c|  15 ++
 tools/xl/xl_forkvm.c  | 147 ++
 tools/xl/xl_vmcontrol.c   |  14 ++
 15 files changed, 562 insertions(+), 166 deletions(-)
 create mode 100644 tools/xl/xl_forkvm.c

diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
index 09339282e6..59c03c6427 100644
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -708,6 +708,50 @@ above).
 
 =back
 
+=item B<fork-vm> [I<OPTIONS>] I<domain-id>
+
+Create a fork of a running VM.  The domain will be paused after the operation
+and remains paused while forks of it exist.  Experimental and x86 only.
+Forks can only be made of domains with HAP enabled and on Intel hardware.  The
+parent domain must be created with the xl toolstack and its configuration must
+not manually define max_grant_frames, max_maptrack_frames or max_event_channels.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-p>
+
+Leave the fork paused after creating it.
+
+=item B<--launch-dm>
+
+Specify whether the device model (QEMU) should be launched for the fork. Late
+launch allows starting the device model for an already running fork.
+
+=item B<-C>
+
+The config file to use when launching the device model.  Currently required when
+launching the device model.  Most config settings MUST match the parent domain
+exactly, only change VM name, disk path and network configurations.
+
+=item B<-Q>
+
+The path to the qemu save file to use when launching the device model.  Currently
+required when launching the device model.
+
+=item B<--fork-reset>
+
+Perform a reset operation of an already running fork.  Note that resetting may
+be less performant than creating a new fork depending on how much memory the
+fork has deduplicated during its runtime.
+
+=item B<--max-vcpus>
+
+Specify the max-vcpus matching the parent domain when not launching the dm.
+
+=back
+
 =item B<sharing> [I<domain-id>]
 
 Display the number of shared pages for a specified domain. If no domain is
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index fc6e57a1a0..00cb4cf1f7 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2225,6 +2225,19 @@ int xc_memshr_range_share(xc_interface *xch,
   uint64_t first_gfn,
   uint64_t last_gfn);
 
+int xc_memshr_fork(xc_interface *xch,
+   uint32_t source_domain,
+   uint32_t client_domain);
+
+/*
+ * Note: this function is only intended to be used on short-lived forks that
+ * haven't yet acquired a lot of memory. In case the fork has a lot of memory
+ * it is likely more performant to create a new fork with xc_memshr_fork.
+ *
+ * With VMs that have a lot of memory this call may block for a long time.
+ */
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain);
+
 /* Debug calls: return the number of pages referencing the shared frame backing
  * the input argument. Should be one or greater.
  *
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 97e2e6a8d9..d0e4ee225b 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -239,6 +239,28 @@ int xc_memshr_debug_gref(xc_interface *xch,
 return xc_memshr_memop(xch, domid, &mso);
 }
 
+int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid)
+{
+xen_mem_sharing_op_t mso;
+
+memset(&mso, 0, sizeof(mso));
+
+mso.op = XENMEM_sharing_op_fork;
+mso.u.fork.parent_domain = pdomid;
+
+return xc_memshr_memop(xch, domid, &mso);
+}
+
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid)
+{
+xen_mem_sharing_op_t mso;
+
+memset(&mso, 0, sizeof(mso));
+mso.op = XENMEM_sharing_op_fork_reset;
+
+return xc_memshr_memop(xch, domid, &mso);
+}
+
 int xc_memshr_audit(xc_interface *xch)
 {
 xen_mem_sharing_op_t mso;
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 71709dc585..088e81c78b 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -2666,6 +2666,17 @@ int libxl_psr_get_hw_info(libxl_ctx *ctx, libxl_psr_feat_type type,
   unsigned int lvl, unsigned int *nr,
   libxl_psr_hw_info **info);
 void libxl_psr_hw_info_list_free(lib

[Xen-devel] [PATCH v12 0/3] VM forking

2020-03-23 Thread Tamas K Lengyel
The following series implements VM forking for Intel HVM guests to allow for
the fast creation of identical VMs without the associated high startup costs
of booting or restoring the VM from a savefile.

JIRA issue: https://xenproject.atlassian.net/browse/XEN-89

The fork operation is implemented as part of the "xl fork-vm" command:
xl fork-vm -C <config> -Q <qemu-save-file> -m <max-vcpus> <parent_domid>

By default a fully functional fork is created. The user however is in charge of
creating the appropriate config file for the fork and generating the QEMU save
file before the fork-vm call is made. The config file needs to give the
fork a new name at minimum but other settings may also require changes. Certain
settings in the config file of both the parent and the fork have to be set to
default. Details are documented.

The interface also allows splitting the forking into two steps:
xl fork-vm --launch-dm no \
   -m <max-vcpus> \
   -p <parent_domid>
xl fork-vm --launch-dm late \
   -C <config> \
   -Q <qemu-save-file> \
   <fork_domid>

The split creation model is useful when the VM needs to be created as fast as
possible. The forked VM can be unpaused without the device model being launched
to be monitored and accessed via VMI. Note however that without its device
model running (depending on what is executing in the VM) it is bound to
misbehave or even crash when it's trying to access devices that would be
emulated by QEMU. We anticipate that for certain use-cases this would be an
acceptable situation, for example when fuzzing code segments that don't
access such devices.

Launching the device model requires the QEMU Xen savefile to be generated
manually from the parent VM. This can be accomplished simply by connecting to
its QMP socket and issuing the "xen-save-devices-state" command. For example
using the standard tool socat these commands can be used to generate the file:
socat - UNIX-CONNECT:/var/run/xen/qmp-libxl-
{ "execute": "qmp_capabilities" }
{ "execute": "xen-save-devices-state", \
"arguments": { "filename": "/path/to/save/qemu_state", \
"live": false} }

At runtime the forked VM starts running with an empty p2m which gets lazily
populated when the VM generates EPT faults, similar to how altp2m views are
populated. If the memory access is a read-only access, the p2m entry is
populated with a memory shared entry with its parent. For write memory accesses
or in case memory sharing wasn't possible (for example in case a reference is
held by a third party), a new page is allocated and the page contents are
copied over from the parent VM. Forks can be further forked if needed, thus
allowing for further memory savings.
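
As a rough sketch of the decision logic described above (the helper names
share_with_parent and copy_from_parent are illustrative placeholders, not the
actual Xen internals):

static int fork_populate_gfn(struct domain *fork, gfn_t gfn, bool write_access)
{
    if ( !write_access )
    {
        /* Read fault: try to back the gfn with a page shared with the
         * parent, so no new memory gets allocated. */
        if ( share_with_parent(fork, gfn) == 0 )
            return 0;
        /* Fall through if sharing failed, e.g. because a third party
         * holds a reference to the parent's page. */
    }

    /* Write fault or sharing failure: allocate a fresh page for the fork
     * and copy the parent's contents into it. */
    return copy_from_parent(fork, gfn);
}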

A VM fork reset hypercall is also added that allows the fork to be reset to the
state it was just after a fork, also accessible via xl:
xl fork-vm --fork-reset -p 

This is an optimization for cases where the forks are very short-lived and run
without a device model, so resetting saves some time compared to creating a
brand new fork, provided the fork has not acquired a lot of memory. If the fork
has a lot of memory deduplicated it is likely going to be faster to create a
new fork from scratch and asynchronously destroy the old one.

The series has been tested with Windows VMs and functions as expected. Linux
VMs when forked from a running VM will have a frozen VNC screen. Linux VMs at
this time can only be forked with a working device model when the parent VM was
restored from a snapshot using "xl restore -p". This is a known limitation.
Also note that PVHVM/PVH Linux guests have not been tested. Forking most likely
works but PV devices and drivers would require additional wiring to set things
up properly since the guests are unaware of the forking taking place, unlike
the save/restore routine where the guest is made aware of the procedure.

Forking time has been measured to be 0.0007s, device model launch to be around
1s depending largely on the number of devices being emulated. Fork resets have
been measured to be 0.0001s under the optimal circumstances.

New in v12:
style cleanups & minor adjustments
removing continuation for fork reset and adding TODO comment

Patch 1 implements the VM fork
Patch 2 implements fork reset operation
Patch 3 adds the toolstack-side code implementing VM forking and reset

Tamas K Lengyel (3):
  xen/mem_sharing: VM forking
  x86/mem_sharing: reset a fork
  xen/tools: VM forking toolstack side

 docs/man/xl.1.pod.in  |  44 +++
 tools/libxc/include/xenctrl.h |  13 +
 tools/libxc/xc_memshr.c   |  22 ++
 tools/libxl/libxl.h   |  11 +
 tools/libxl/libxl_create.c| 361 +---
 tools/libxl/libxl_dm.c|   2 +-
 tools/libxl/libxl_dom.c   |  43 ++-
 tools/libxl/libxl_internal.h  |   7 +
 tools/libxl/libxl_types.idl

[Xen-devel] [PATCH v12 2/3] x86/mem_sharing: reset a fork

2020-03-23 Thread Tamas K Lengyel
Implement hypercall that allows a fork to shed all memory that got allocated
for it during its execution and re-load its vCPU context from the parent VM.
This allows the forked VM to reset into the same state the parent VM is in, a
faster way than creating a new fork would be. Measurements show about a 2x
speedup during normal fuzzing operations. Performance may vary depending on how
much memory got allocated for the forked VM. If it has been completely
deduplicated from the parent VM then creating a new fork would likely be more
performant.

Signed-off-by: Tamas K Lengyel 
---
v12: remove continuation & add comment back
 address style issues pointed out by Jan
---
 xen/arch/x86/mm/mem_sharing.c | 77 +++
 xen/include/public/memory.h   |  1 +
 2 files changed, 78 insertions(+)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 23deeddff2..930a5f58ef 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1775,6 +1775,60 @@ static int fork(struct domain *cd, struct domain *d)
 return rc;
 }
 
+/*
+ * The fork reset operation is intended to be used on short-lived forks only.
+ * There is no hypercall continuation operation implemented for this reason.
+ * For forks that obtain a larger memory footprint it is likely going to be
+ * more performant to create a new fork instead of resetting an existing one.
+ *
+ * TODO: In case this hypercall would become useful on forks with larger memory
+ * footprints the hypercall continuation should be implemented (or if this
+ * feature needs to become "stable").
+ */
+static int mem_sharing_fork_reset(struct domain *d, struct domain *pd)
+{
+int rc;
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+struct page_info *page, *tmp;
+
+spin_lock(&d->page_alloc_lock);
+domain_pause(d);
+
+page_list_for_each_safe(page, tmp, >page_list)
+{
+p2m_type_t p2mt;
+p2m_access_t p2ma;
+mfn_t mfn = page_to_mfn(page);
+gfn_t gfn = mfn_to_gfn(d, mfn);
+
+mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
+0, NULL, false);
+
+/* only reset pages that are sharable */
+if ( !p2m_is_sharable(p2mt) )
+continue;
+
+/* take an extra reference or just skip if can't for whatever reason */
+if ( !get_page(page, d) )
+continue;
+
+/* forked memory is 4k, not splitting large pages so this must work */
+rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
+p2m_invalid, p2m_access_rwx, -1);
+ASSERT(!rc);
+
+put_page_alloc_ref(page);
+put_page(page);
+}
+
+rc = copy_settings(d, pd);
+
+domain_unpause(d);
+spin_unlock(&d->page_alloc_lock);
+
+return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
 int rc;
@@ -2066,6 +2120,29 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 break;
 }
 
+case XENMEM_sharing_op_fork_reset:
+{
+struct domain *pd;
+
+rc = -EINVAL;
+if ( mso.u.fork.pad[0] || mso.u.fork.pad[1] ||
+ mso.u.fork.pad[2] )
+goto out;
+
+rc = -ENOSYS;
+if ( !d->parent )
+goto out;
+
+rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
+if ( rc )
+goto out;
+
+rc = mem_sharing_fork_reset(d, pd);
+
+rcu_unlock_domain(pd);
+break;
+}
+
 default:
 rc = -ENOSYS;
 break;
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 5ee4e0da12..d36d64b8dc 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -483,6 +483,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_audit 7
 #define XENMEM_sharing_op_range_share   8
 #define XENMEM_sharing_op_fork  9
+#define XENMEM_sharing_op_fork_reset 10
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
-- 
2.20.1




Re: [Xen-devel] [PATCH] memaccess: reduce include dependencies

2020-03-09 Thread Tamas K Lengyel
On Mon, Mar 9, 2020 at 5:49 AM Jan Beulich  wrote:
>
> The common header doesn't itself need to include public/vm_event.h nor
> public/memory.h. Drop their inclusion. This requires using the non-
> typedef names in two prototypes and an inline function; by not changing
> the callers and function definitions at the same time it'll remain
> certain that the build would fail if the typedef itself was changed.
>
> Signed-off-by: Jan Beulich 
>
> --- a/xen/include/asm-arm/mem_access.h
> +++ b/xen/include/asm-arm/mem_access.h
> @@ -17,9 +17,11 @@
>  #ifndef _ASM_ARM_MEM_ACCESS_H
>  #define _ASM_ARM_MEM_ACCESS_H
>
> +struct vm_event_st;
> +
>  static inline
>  bool p2m_mem_access_emulate_check(struct vcpu *v,
> -  const vm_event_response_t *rsp)
> +  const struct vm_event_st *rsp)
>  {
>  /* Not supported on ARM. */
>  return false;
> --- a/xen/include/asm-x86/mem_access.h
> +++ b/xen/include/asm-x86/mem_access.h
> @@ -26,6 +26,8 @@
>  #ifndef __ASM_X86_MEM_ACCESS_H__
>  #define __ASM_X86_MEM_ACCESS_H__
>
> +struct vm_event_st;

Wouldn't it make more sense to define this in xen/mem_access.h instead
of having to do it in both asm versions? Nothing directly includes
asm/mem_access.h, all users include xen/mem_access.h

Tamas


Re: [Xen-devel] [PATCH] memaccess: reduce include dependencies

2020-03-09 Thread Tamas K Lengyel
On Mon, Mar 9, 2020 at 10:03 AM Jan Beulich  wrote:
>
> On 09.03.2020 16:51, Tamas K Lengyel wrote:
> > On Mon, Mar 9, 2020 at 5:49 AM Jan Beulich  wrote:
> >> --- a/xen/include/asm-arm/mem_access.h
> >> +++ b/xen/include/asm-arm/mem_access.h
> >> @@ -17,9 +17,11 @@
> >>  #ifndef _ASM_ARM_MEM_ACCESS_H
> >>  #define _ASM_ARM_MEM_ACCESS_H
> >>
> >> +struct vm_event_st;
> >> +
> >>  static inline
> >>  bool p2m_mem_access_emulate_check(struct vcpu *v,
> >> -  const vm_event_response_t *rsp)
> >> +  const struct vm_event_st *rsp)
> >>  {
> >>  /* Not supported on ARM. */
> >>  return false;
> >> --- a/xen/include/asm-x86/mem_access.h
> >> +++ b/xen/include/asm-x86/mem_access.h
> >> @@ -26,6 +26,8 @@
> >>  #ifndef __ASM_X86_MEM_ACCESS_H__
> >>  #define __ASM_X86_MEM_ACCESS_H__
> >>
> >> +struct vm_event_st;
> >
> > Wouldn't it make more sense to define this in xen/mem_access.h instead
> > of having to do it in both asm versions? Nothing directly includes
> > asm/mem_access.h, all users include xen/mem_access.h
>
> If that's what you prefer - I can certainly do so. It'll look a
> little odd then, as the forward declaration has to come ahead of
>
> #include 
>
> Just let me know if you really prefer it that way.

Well, I find it ugly either way. I would prefer if it's forward
declared just at one spot, with a comment explaining why it's
needed/done that way.
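
For illustration, the single-spot placement being discussed would look roughly
like this in xen/mem_access.h (a sketch only, not the committed change):

/*
 * Forward declaration of the public vm_event request/response structure,
 * done once here so neither asm-arm nor asm-x86 mem_access.h has to repeat
 * it. It has to come ahead of the asm include below, whose prototypes
 * reference the type.
 */
struct vm_event_st;

#include <asm/mem_access.h>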

Thanks,
Tamas


[PATCH v18 1/2] tools/libxc: add VM forking functions

2020-05-06 Thread Tamas K Lengyel
Add functions to issue VM forking hypercalls

Signed-off-by: Tamas K Lengyel 
---
 tools/libxc/include/xenctrl.h | 14 ++
 tools/libxc/xc_memshr.c   | 26 ++
 2 files changed, 40 insertions(+)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 5f25c5a6d4..0a6ff93229 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2232,6 +2232,20 @@ int xc_memshr_range_share(xc_interface *xch,
   uint64_t first_gfn,
   uint64_t last_gfn);
 
+int xc_memshr_fork(xc_interface *xch,
+   uint32_t source_domain,
+   uint32_t client_domain,
+   bool allow_with_iommu);
+
+/*
+ * Note: this function is only intended to be used on short-lived forks that
+ * haven't yet acquired a lot of memory. In case the fork has a lot of memory
+ * it is likely more performant to create a new fork with xc_memshr_fork.
+ *
+ * With VMs that have a lot of memory this call may block for a long time.
+ */
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t forked_domain);
+
 /* Debug calls: return the number of pages referencing the shared frame backing
  * the input argument. Should be one or greater.
  *
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 97e2e6a8d9..2300cc7075 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -239,6 +239,32 @@ int xc_memshr_debug_gref(xc_interface *xch,
 return xc_memshr_memop(xch, domid, &mso);
 }
 
+int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
+   bool allow_with_iommu)
+{
+xen_mem_sharing_op_t mso;
+
+memset(&mso, 0, sizeof(mso));
+
+mso.op = XENMEM_sharing_op_fork;
+mso.u.fork.parent_domain = pdomid;
+
+if ( allow_with_iommu )
+mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+
+return xc_memshr_memop(xch, domid, &mso);
+}
+
+int xc_memshr_fork_reset(xc_interface *xch, uint32_t domid)
+{
+xen_mem_sharing_op_t mso;
+
+memset(&mso, 0, sizeof(mso));
+mso.op = XENMEM_sharing_op_fork_reset;
+
+return xc_memshr_memop(xch, domid, &mso);
+}
+
 int xc_memshr_audit(xc_interface *xch)
 {
 xen_mem_sharing_op_t mso;
-- 
2.25.1
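
To make the intended usage concrete, here is a minimal sketch of a fuzzing
loop built on these two calls (error handling trimmed; it assumes an empty
shell domain fork_domid was already created by the toolstack, and the
input-injection step is left abstract):

#include <xenctrl.h>

int fuzz_with_fork(uint32_t parent_domid, uint32_t fork_domid, int iterations)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    int i, rc = -1;

    if ( !xch )
        return -1;

    /* Fork the (paused) parent into the shell domain once. */
    rc = xc_memshr_fork(xch, parent_domid, fork_domid,
                        false /* allow_with_iommu */);

    for ( i = 0; !rc && i < iterations; i++ )
    {
        /* ... inject an input, unpause fork_domid, run the target ... */

        /* Shed all fork-private memory and reload vCPU state from the
         * parent; cheap while the fork's footprint is still small. */
        rc = xc_memshr_fork_reset(xch, fork_domid);
    }

    xc_interface_close(xch);
    return rc;
}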




Re: [PATCH v17 2/2] xen/tools: VM forking toolstack side

2020-05-06 Thread Tamas K Lengyel
On Wed, May 6, 2020 at 7:00 AM Wei Liu  wrote:
>
> On Fri, May 01, 2020 at 07:59:45AM -0600, Tamas K Lengyel wrote:
> > On Thu, Apr 23, 2020 at 9:33 AM Tamas K Lengyel  
> > wrote:
> > >
> > > Add necessary bits to implement "xl fork-vm" commands. The command allows 
> > > the
> > > user to specify how to launch the device model allowing for a late-launch 
> > > model
> > > in which the user can execute the fork without the device model and 
> > > decide to
> > > only later launch it.
> > >
> > > Signed-off-by: Tamas K Lengyel 
> >
> > Patch ping. If nothing else at least the libxc parts would be nice to
> > get merged before the freeze.
>
> Changes to libxc looks good to me.
>
> Please split it out to a patch with proper commit message.
>

Sounds good, will do.

Thanks,
Tamas



[PATCH v18 2/2] tools/libxl: VM forking toolstack side

2020-05-06 Thread Tamas K Lengyel
Add necessary bits to implement "xl fork-vm" commands. The command allows the
user to specify how to launch the device model, allowing for a late-launch model
in which the user can run the fork without the device model and decide to
launch it only later.

Signed-off-by: Tamas K Lengyel 
---
 docs/man/xl.1.pod.in |  49 +
 tools/libxl/libxl.h  |  12 ++
 tools/libxl/libxl_create.c   | 359 ---
 tools/libxl/libxl_dm.c   |   2 +-
 tools/libxl/libxl_dom.c  |  43 -
 tools/libxl/libxl_internal.h |   7 +
 tools/libxl/libxl_types.idl  |   1 +
 tools/libxl/libxl_x86.c  |  42 
 tools/xl/Makefile|   2 +-
 tools/xl/xl.h|   5 +
 tools/xl/xl_cmdtable.c   |  15 ++
 tools/xl/xl_forkvm.c | 149 +++
 tools/xl/xl_vmcontrol.c  |  14 ++
 13 files changed, 535 insertions(+), 165 deletions(-)
 create mode 100644 tools/xl/xl_forkvm.c

diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
index 09339282e6..67b4e8588a 100644
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -708,6 +708,55 @@ above).
 
 =back
 
+=item B<fork-vm> [I<OPTIONS>] I<domain-id>
+
+Create a fork of a running VM.  The domain will be paused after the operation
+and remains paused while forks of it exist.  Experimental and x86 only.
+Forks can only be made of domains with HAP enabled and on Intel hardware.  The
+parent domain must be created with the xl toolstack and its configuration must
+not manually define max_grant_frames, max_maptrack_frames or max_event_channels.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-p>
+
+Leave the fork paused after creating it.
+
+=item B<--launch-dm>
+
+Specify whether the device model (QEMU) should be launched for the fork. Late
+launch allows starting the device model for an already running fork.
+
+=item B<-C>
+
+The config file to use when launching the device model.  Currently required when
+launching the device model.  Most config settings MUST match the parent domain
+exactly, only change VM name, disk path and network configurations.
+
+=item B<-Q>
+
+The path to the qemu save file to use when launching the device model.  Currently
+required when launching the device model.
+
+=item B<--fork-reset>
+
+Perform a reset operation of an already running fork.  Note that resetting may
+be less performant than creating a new fork depending on how much memory the
+fork has deduplicated during its runtime.
+
+=item B<--max-vcpus>
+
+Specify the max-vcpus matching the parent domain when not launching the dm.
+
+=item B<--allow-iommu>
+
+Specify to allow forking a domain that has IOMMU enabled. Only compatible with
+forks using --launch-dm no.
+
+=back
+
 =item B<sharing> [I<domain-id>]
 
 Display the number of shared pages for a specified domain. If no domain is
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 71709dc585..4bbb0a773d 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -2666,6 +2666,18 @@ int libxl_psr_get_hw_info(libxl_ctx *ctx, libxl_psr_feat_type type,
   unsigned int lvl, unsigned int *nr,
   libxl_psr_hw_info **info);
 void libxl_psr_hw_info_list_free(libxl_psr_hw_info *list, unsigned int nr);
+
+int libxl_domain_fork_vm(libxl_ctx *ctx, uint32_t pdomid, uint32_t max_vcpus,
+ bool allow_with_iommu, uint32_t *domid)
+ LIBXL_EXTERNAL_CALLERS_ONLY;
+
+int libxl_domain_fork_launch_dm(libxl_ctx *ctx, libxl_domain_config *d_config,
+uint32_t domid,
+const libxl_asyncprogress_how *aop_console_how)
+LIBXL_EXTERNAL_CALLERS_ONLY;
+
+int libxl_domain_fork_reset(libxl_ctx *ctx, uint32_t domid)
+LIBXL_EXTERNAL_CALLERS_ONLY;
 #endif
 
 /* misc */
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 5a043df15f..1a930c2de7 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -538,12 +538,12 @@ out:
 return ret;
 }
 
-int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
-   libxl__domain_build_state *state,
-   uint32_t *domid, bool soft_reset)
+static int libxl__domain_make_xs_entries(libxl__gc *gc, libxl_domain_config *d_config,
+ libxl__domain_build_state *state,
+ uint32_t domid)
 {
 libxl_ctx *ctx = libxl__gc_owner(gc);
-int ret, rc, nb_vm;
+int rc, nb_vm;
 const char *dom_type;
 char *uuid_string;
 char *dom_path, *vm_path, *libxl_path;
@@ -555,9 +555,6 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
 /* convenience aliases */
 libxl_domain_create_info *info = &d_config->c_info;
-libxl_domain_build_info *b_info = &d_config->b_info;
-
-assert(soft_reset || *domid == INVALID_DOMID);
 
 uuid_string =

QEMU-Xen build failure

2020-05-06 Thread Tamas K Lengyel
Hi all,
on a recent checkout of the Xen staging source I ran into the
following build error with QEMU upstream:

tools/qemu-xen-dir-remote/slirp/src/ip_input.c:330:5: error: ISO C90
forbids mixed declarations and code
[-Werror=declaration-after-statement]
 int delta = (char *)q - (m->m_flags & M_EXT ? m->m_ext : m->m_dat);

Tamas



Re: [PATCH 3/3] xen/vm_event: Add safe to disable vm_event

2020-05-20 Thread Tamas K Lengyel
On Wed, May 20, 2020 at 7:45 AM Jan Beulich  wrote:
>
> On 15.05.2020 18:53, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -563,15 +563,41 @@ void hvm_do_resume(struct vcpu *v)
> >  v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
> >  }
> >
> > -if ( unlikely(v->arch.vm_event) && 
> > v->arch.monitor.next_interrupt_enabled )
> > +if ( unlikely(v->arch.vm_event) )
> >  {
> > -struct x86_event info;
> > +struct domain *d = v->domain;
>
> const
>
> > +if ( v->arch.monitor.next_interrupt_enabled )
> > +{
> > +struct x86_event info;
> > +
> > +if ( hvm_get_pending_event(v, &info) )
> > +{
> > +hvm_monitor_interrupt(info.vector, info.type, 
> > info.error_code,
> > +  info.cr2);
> > +v->arch.monitor.next_interrupt_enabled = false;
> > +}
> > +}
> >
> > -if ( hvm_get_pending_event(v, &info) )
> > +if ( d->arch.monitor.safe_to_disable )
> >  {
> > -hvm_monitor_interrupt(info.vector, info.type, info.error_code,
> > -  info.cr2);
> > -v->arch.monitor.next_interrupt_enabled = false;
> > +struct vcpu *check_vcpu;
>
> const again, requiring a respective adjustment to patch 2.
>
> > +bool pending_op = false;
> > +
> > +for_each_vcpu ( d, check_vcpu )
> > +{
> > +if ( vm_event_check_pending_op(check_vcpu) )
> > +{
> > +pending_op = true;
> > +break;
> > +}
> > +}
> > +
> > +if ( !pending_op )
> > +{
> > +hvm_monitor_safe_to_disable();
>
> This new function returns bool without the caller caring about the
> return value.

Yea, there is actually nothing to be done if the event can't be sent
for whatever reason, so I guess I'll just turn it to void.

Tamas



Re: [PATCH 1/3] xen/monitor: Control register values

2020-05-20 Thread Tamas K Lengyel
On Wed, May 20, 2020 at 7:36 AM Jan Beulich  wrote:
>
> On 15.05.2020 18:53, Tamas K Lengyel wrote:
> > Extend the monitor_op domctl to include option that enables
> > controlling what values certain registers are permitted to hold
> > by a monitor subscriber.
>
> This needs a bit more explanation, especially for those of us
> who aren't that introspection savvy. For example, from the text
> here I didn't expect a simple bool control, but something where
> actual (register) values get passed back and forth.
>
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -2263,9 +2263,10 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
> >  {
> >  ASSERT(v->arch.vm_event);
> >
> > -if ( hvm_monitor_crX(CR0, value, old_value) )
> > +if ( hvm_monitor_crX(CR0, value, old_value) &&
> > + v->domain->arch.monitor.control_register_values )
> >  {
> > -/* The actual write will occur in hvm_do_resume(), if 
> > permitted. */
> > +/* The actual write will occur in hvm_do_resume, if permitted. 
> > */
>
> Please can you leave alone this and the similar comments below.
> And for consistency _add_ parentheses to the one new instance
> you add?

I changed it because now it doesn't fit into the 80-character line limit below,
and then changed it everywhere _for_ consistency.

>
> > --- a/xen/arch/x86/monitor.c
> > +++ b/xen/arch/x86/monitor.c
> > @@ -144,7 +144,15 @@ int arch_monitor_domctl_event(struct domain *d,
> >struct xen_domctl_monitor_op *mop)
> >  {
> >  struct arch_domain *ad = &d->arch;
> > -bool requested_status = (XEN_DOMCTL_MONITOR_OP_ENABLE == mop->op);
> > +bool requested_status;
> > +
> > +if ( XEN_DOMCTL_MONITOR_OP_CONTROL_REGISTERS == mop->op )
> > +{
> > +ad->monitor.control_register_values = true;
>
> And there's no way to clear this flag again?

There is. Disable the monitor vm_event interface and reinitialize.

Tamas



Re: [PATCH 1/3] xen/monitor: Control register values

2020-05-20 Thread Tamas K Lengyel
On Wed, May 20, 2020 at 7:48 AM Jan Beulich  wrote:
>
> On 20.05.2020 15:42, Tamas K Lengyel wrote:
> > On Wed, May 20, 2020 at 7:36 AM Jan Beulich  wrote:
> >>
> >> On 15.05.2020 18:53, Tamas K Lengyel wrote:
> >>> Extend the monitor_op domctl to include option that enables
> >>> controlling what values certain registers are permitted to hold
> >>> by a monitor subscriber.
> >>
> >> This needs a bit more explanation, especially for those of us
> >> who aren't that introspection savvy. For example, from the text
> >> here I didn't expect a simple bool control, but something where
> >> actual (register) values get passed back and forth.
> >>
> >>> --- a/xen/arch/x86/hvm/hvm.c
> >>> +++ b/xen/arch/x86/hvm/hvm.c
> >>> @@ -2263,9 +2263,10 @@ int hvm_set_cr0(unsigned long value, bool 
> >>> may_defer)
> >>>  {
> >>>  ASSERT(v->arch.vm_event);
> >>>
> >>> -if ( hvm_monitor_crX(CR0, value, old_value) )
> >>> +if ( hvm_monitor_crX(CR0, value, old_value) &&
> >>> + v->domain->arch.monitor.control_register_values )
> >>>  {
> >>> -/* The actual write will occur in hvm_do_resume(), if 
> >>> permitted. */
> >>> +/* The actual write will occur in hvm_do_resume, if 
> >>> permitted. */
> >>
> >> Please can you leave alone this and the similar comments below.
> >> And for consistency _add_ parentheses to the one new instance
> >> you add?
> >
> > I changed it because now it doesn't fit into the 80-character line limit below,
> > and then changed it everywhere _for_ consistency.
>
> The 80-char limit is easy to deal with - wrap the line.
>
> >>> --- a/xen/arch/x86/monitor.c
> >>> +++ b/xen/arch/x86/monitor.c
> >>> @@ -144,7 +144,15 @@ int arch_monitor_domctl_event(struct domain *d,
> >>>struct xen_domctl_monitor_op *mop)
> >>>  {
> >>>  struct arch_domain *ad = &d->arch;
> >>> -bool requested_status = (XEN_DOMCTL_MONITOR_OP_ENABLE == mop->op);
> >>> +bool requested_status;
> >>> +
> >>> +if ( XEN_DOMCTL_MONITOR_OP_CONTROL_REGISTERS == mop->op )
> >>> +{
> >>> +ad->monitor.control_register_values = true;
> >>
> >> And there's no way to clear this flag again?
> >
> > There is. Disable the monitor vm_event interface and reinitialize.
>
> Quite heavy handed, isn't it?

Not really. It's perfectly suitable for what it's used for. You either
need this feature for the duration of your monitoring or you don't.
There is no in-between.

Tamas



Re: [PATCH 3/3] xen/vm_event: Add safe to disable vm_event

2020-05-20 Thread Tamas K Lengyel
On Wed, May 20, 2020 at 7:45 AM Jan Beulich  wrote:
>
> On 15.05.2020 18:53, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -563,15 +563,41 @@ void hvm_do_resume(struct vcpu *v)
> >  v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
> >  }
> >
> > -if ( unlikely(v->arch.vm_event) && 
> > v->arch.monitor.next_interrupt_enabled )
> > +if ( unlikely(v->arch.vm_event) )
> >  {
> > -struct x86_event info;
> > +struct domain *d = v->domain;
>
> const

This can't be const; we disable the safe_to_disable option below after
sending the one-shot async event.

Tamas



[PATCH v2 for-4.14 3/3] xen/vm_event: Add safe to disable vm_event

2020-05-20 Thread Tamas K Lengyel
Instead of having to repeatedly try to disable vm_events, request a specific
vm_event to be sent when the domain is safe to continue with shutting down
the vm_event interface.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/hvm/hvm.c| 38 ++-
 xen/arch/x86/hvm/monitor.c| 14 
 xen/arch/x86/monitor.c| 13 +++
 xen/include/asm-x86/domain.h  |  1 +
 xen/include/asm-x86/hvm/monitor.h |  1 +
 xen/include/public/domctl.h   |  2 ++
 xen/include/public/vm_event.h |  8 +++
 7 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index e6780c685b..fc7e1e2b22 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -563,15 +563,41 @@ void hvm_do_resume(struct vcpu *v)
 v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
 }
 
-if ( unlikely(v->arch.vm_event) && v->arch.monitor.next_interrupt_enabled )
+if ( unlikely(v->arch.vm_event) )
 {
-struct x86_event info;
+struct domain *d = v->domain;
+
+if ( v->arch.monitor.next_interrupt_enabled )
+{
+struct x86_event info;
+
+if ( hvm_get_pending_event(v, &info) )
+{
+hvm_monitor_interrupt(info.vector, info.type, info.error_code,
+  info.cr2);
+v->arch.monitor.next_interrupt_enabled = false;
+}
+}
 
-if ( hvm_get_pending_event(v, &info) )
+if ( d->arch.monitor.safe_to_disable )
 {
-hvm_monitor_interrupt(info.vector, info.type, info.error_code,
-  info.cr2);
-v->arch.monitor.next_interrupt_enabled = false;
+const struct vcpu *check_vcpu;
+bool pending_op = false;
+
+for_each_vcpu ( d, check_vcpu )
+{
+if ( vm_event_check_pending_op(check_vcpu) )
+{
+pending_op = true;
+break;
+}
+}
+
+if ( !pending_op )
+{
+hvm_monitor_safe_to_disable();
+d->arch.monitor.safe_to_disable = false;
+}
 }
 }
 }
diff --git a/xen/arch/x86/hvm/monitor.c b/xen/arch/x86/hvm/monitor.c
index f5d89e71d1..75fd1a4b68 100644
--- a/xen/arch/x86/hvm/monitor.c
+++ b/xen/arch/x86/hvm/monitor.c
@@ -300,6 +300,20 @@ bool hvm_monitor_check_p2m(unsigned long gla, gfn_t gfn, uint32_t pfec,
 return monitor_traps(curr, true, &req) >= 0;
 }
 
+void hvm_monitor_safe_to_disable(void)
+{
+struct vcpu *curr = current;
+struct arch_domain *ad = &curr->domain->arch;
+vm_event_request_t req = {};
+
+if ( !ad->monitor.safe_to_disable )
+return;
+
+req.reason = VM_EVENT_REASON_SAFE_TO_DISABLE;
+
+monitor_traps(curr, 0, &req);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index 1517a97f50..86e0ba2fbc 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -339,6 +339,19 @@ int arch_monitor_domctl_event(struct domain *d,
 break;
 }
 
+case XEN_DOMCTL_MONITOR_EVENT_SAFE_TO_DISABLE:
+{
+bool old_status = ad->monitor.safe_to_disable;
+
+if ( unlikely(old_status == requested_status) )
+return -EEXIST;
+
+domain_pause(d);
+ad->monitor.safe_to_disable = requested_status;
+domain_unpause(d);
+break;
+}
+
 default:
 /*
  * Should not be reached unless arch_monitor_get_capabilities() is
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index d890ab7a22..948b750c71 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -417,6 +417,7 @@ struct arch_domain
  */
 unsigned int inguest_pagefault_disabled: 1;
 unsigned int control_register_values   : 1;
+unsigned int safe_to_disable   : 1;
 struct monitor_msr_bitmap *msr_bitmap;
 uint64_t write_ctrlreg_mask[4];
 } monitor;
diff --git a/xen/include/asm-x86/hvm/monitor.h b/xen/include/asm-x86/hvm/monitor.h
index 66de24cb75..dbc113a635 100644
--- a/xen/include/asm-x86/hvm/monitor.h
+++ b/xen/include/asm-x86/hvm/monitor.h
@@ -52,6 +52,7 @@ bool hvm_monitor_emul_unimplemented(void);
 
 bool hvm_monitor_check_p2m(unsigned long gla, gfn_t gfn, uint32_t pfec,
uint16_t kind);
+void hvm_monitor_safe_to_disable(void);
 
 #endif /* __ASM_X86_HVM_MONITOR_H__ */
 
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index cbcd25f12c..247e809a6c 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1040,6 +1040,8 @@ struct xen_domctl_psr_cmt_op {
 #define XEN_D

[PATCH v2 for-4.14 2/3] xen/vm_event: add vm_event_check_pending_op

2020-05-20 Thread Tamas K Lengyel
Perform sanity checking when shutting vm_event down to determine whether
it is safe to do so. Error out with -EAGAIN in case pending operations
have been found for the domain.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/vm_event.c| 23 +++
 xen/common/vm_event.c  | 17 ++---
 xen/include/asm-arm/vm_event.h |  7 +++
 xen/include/asm-x86/vm_event.h |  2 ++
 4 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
index 848d69c1b0..a23aadc112 100644
--- a/xen/arch/x86/vm_event.c
+++ b/xen/arch/x86/vm_event.c
@@ -297,6 +297,29 @@ void vm_event_emulate_check(struct vcpu *v, vm_event_response_t *rsp)
 };
 }
 
+bool vm_event_check_pending_op(const struct vcpu *v)
+{
+struct monitor_write_data *w = &v->arch.vm_event->write_data;
+
+if ( !v->arch.vm_event->sync_event )
+return false;
+
+if ( w->do_write.cr0 )
+return true;
+if ( w->do_write.cr3 )
+return true;
+if ( w->do_write.cr4 )
+return true;
+if ( w->do_write.msr )
+return true;
+if ( v->arch.vm_event->set_gprs )
+return true;
+if ( v->arch.vm_event->emulate_flags )
+return true;
+
+return false;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
index 127f2d58f1..2df327a42c 100644
--- a/xen/common/vm_event.c
+++ b/xen/common/vm_event.c
@@ -183,6 +183,7 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
 if ( vm_event_check_ring(ved) )
 {
 struct vcpu *v;
+bool pending_op = false;
 
 spin_lock(&ved->lock);
 
@@ -192,9 +193,6 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
 return -EBUSY;
 }
 
-/* Free domU's event channel and leave the other one unbound */
-free_xen_event_channel(d, ved->xen_port);
-
 /* Unblock all vCPUs */
 for_each_vcpu ( d, v )
 {
@@ -203,8 +201,21 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
 vcpu_unpause(v);
 ved->blocked--;
 }
+
+if ( vm_event_check_pending_op(v) )
+pending_op = true;
 }
 
+/* vm_event ops are still pending until vCPUs get scheduled */
+if ( pending_op )
+{
+spin_unlock(&ved->lock);
+return -EAGAIN;
+}
+
+/* Free domU's event channel and leave the other one unbound */
+free_xen_event_channel(d, ved->xen_port);
+
 destroy_ring_for_helper(&ved->ring_page, ved->ring_pg_struct);
 
 vm_event_cleanup_domain(d);
diff --git a/xen/include/asm-arm/vm_event.h b/xen/include/asm-arm/vm_event.h
index 14d1d341cc..978b224dc3 100644
--- a/xen/include/asm-arm/vm_event.h
+++ b/xen/include/asm-arm/vm_event.h
@@ -58,4 +58,11 @@ void vm_event_sync_event(struct vcpu *v, bool value)
 /* Not supported on ARM. */
 }
 
+static inline
+bool vm_event_check_pending_op(const struct vcpu *v)
+{
+/* Not supported on ARM. */
+return false;
+}
+
 #endif /* __ASM_ARM_VM_EVENT_H__ */
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 785e741fba..97860d0d99 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -54,4 +54,6 @@ void vm_event_emulate_check(struct vcpu *v, vm_event_response_t *rsp);
 
 void vm_event_sync_event(struct vcpu *v, bool value);
 
+bool vm_event_check_pending_op(const struct vcpu *v);
+
 #endif /* __ASM_X86_VM_EVENT_H__ */
-- 
2.26.1




[PATCH v2 for-4.14 0/3] vm_event: fix race-condition when disabling monitor events

2020-05-20 Thread Tamas K Lengyel
For the last couple years we have received numerous reports from users of
monitor vm_events of spurious guest crashes when using events. In particular,
it has been observed that the problem occurs when vm_events are being disabled. The
nature of the guest crash varied widely and has only occurred occasionally. This
made debugging the issue particularly hard. We had discussions about this issue
even here on the xen-devel mailinglist with no luck figuring it out.

The bug has now been identified as a race-condition between register event
handling and disabling the vm_event interface.

Patch 96760e2fba100d694300a81baddb5740e0f8c0ee, "vm_event: deny register writes
if refused by  vm_event reply" is the patch that introduced the error. In this
patch emulation of register write events can be postponed until the
corresponding vm_event handler decides whether to allow such write to take
place. Unfortunately this can only be implemented by performing the deny/allow
step when the vCPU gets scheduled. Due to that postponed emulation of the event
if the user decides to pause the VM in the vm_event handler and then disable
events, the entire emulation step is skipped the next time the vCPU is resumed.
Even if the user doesn't pause during the vm_event handling but exits
immediately and disables vm_event, the situation becomes racy as disabling
vm_event may succeed before the guest's vCPUs get scheduled with the pending
emulation task. This has been particularly the case with VMs that have several
vCPUs, as after the VM is unpaused it may actually take a long time before all
vCPUs get scheduled.

The only solution currently is to poll each vCPU before vm_events are disabled
to verify they had been scheduled before it is safe to disable vm_events. The
following patches resolve this issue in a much nicer way.

Patch 1 adds an option to the monitor_op domctl that needs to be specified if
the user wants to actually use the postponed register-write handling
mechanism. If that option is not specified then handling is performed the
same way as before patch 96760e2fba100d694300a81baddb5740e0f8c0ee.

Patch 2 performs sanity checking when disabling vm_events to determine whether
it's safe to free all vm_event structures. The vCPUs still get unpaused to
allow them to get scheduled and perform any of their pending operations,
but otherwise an -EAGAIN error is returned signaling to the user that they
need to wait and try again disabling the interface.

Patch 3 adds a vm_event specifically to signal to the user when it is safe to
continue disabling the interface.
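
Put together, the consumer-side shutdown flow the series enables looks roughly
like this (a sketch only; xc_monitor_disable() is the existing libxc call,
while wait_for_safe_to_disable_event() stands in for the subscriber's
ring-processing loop that watches for VM_EVENT_REASON_SAFE_TO_DISABLE):

#include <errno.h>
#include <xenctrl.h>

/* Hypothetical helper: drain ring events until the
 * VM_EVENT_REASON_SAFE_TO_DISABLE notification arrives. */
extern void wait_for_safe_to_disable_event(xc_interface *xch, uint32_t domid);

static int shutdown_monitor(xc_interface *xch, uint32_t domid)
{
    int rc;

    for ( ;; )
    {
        rc = xc_monitor_disable(xch, domid);
        if ( rc == 0 || errno != EAGAIN )
            break;

        /* Some vCPU still has a pending register-write op; wait for the
         * safe-to-disable event instead of blindly retrying. */
        wait_for_safe_to_disable_event(xch, domid);
    }

    return rc;
}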

Shout out to our friends at CERT.pl for stumbling upon a crucial piece of
information that led to finally squashing this nasty bug.

v2: minor adjustments based on Jan's comments

Tamas K Lengyel (3):
  xen/monitor: Control register values
  xen/vm_event: add vm_event_check_pending_op
  xen/vm_event: Add safe to disable vm_event

 xen/arch/x86/hvm/hvm.c| 63 +++
 xen/arch/x86/hvm/monitor.c| 14 +++
 xen/arch/x86/monitor.c| 23 ++-
 xen/arch/x86/vm_event.c   | 23 +++
 xen/common/vm_event.c | 17 +++--
 xen/include/asm-arm/vm_event.h|  7 
 xen/include/asm-x86/domain.h  |  2 +
 xen/include/asm-x86/hvm/monitor.h |  1 +
 xen/include/asm-x86/vm_event.h|  2 +
 xen/include/public/domctl.h   |  3 ++
 xen/include/public/vm_event.h |  8 
 11 files changed, 144 insertions(+), 19 deletions(-)

-- 
2.26.1




[PATCH v2 for-4.14 1/3] xen/monitor: Control register values

2020-05-20 Thread Tamas K Lengyel
Extend the monitor_op domctl to include option that enables
controlling what values certain registers are permitted to hold
by a monitor subscriber.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/hvm/hvm.c   | 25 -
 xen/arch/x86/monitor.c   | 10 +-
 xen/include/asm-x86/domain.h |  1 +
 xen/include/public/domctl.h  |  1 +
 4 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 09ee299bc7..e6780c685b 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2263,7 +2263,8 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
 {
 ASSERT(v->arch.vm_event);
 
-if ( hvm_monitor_crX(CR0, value, old_value) )
+if ( hvm_monitor_crX(CR0, value, old_value) &&
+ v->domain->arch.monitor.control_register_values )
 {
 /* The actual write will occur in hvm_do_resume(), if permitted. */
 v->arch.vm_event->write_data.do_write.cr0 = 1;
@@ -2362,7 +2363,8 @@ int hvm_set_cr3(unsigned long value, bool may_defer)
 {
 ASSERT(v->arch.vm_event);
 
-if ( hvm_monitor_crX(CR3, value, old) )
+if ( hvm_monitor_crX(CR3, value, old) &&
+ v->domain->arch.monitor.control_register_values )
 {
 /* The actual write will occur in hvm_do_resume(), if permitted. */
 v->arch.vm_event->write_data.do_write.cr3 = 1;
@@ -2443,7 +2445,8 @@ int hvm_set_cr4(unsigned long value, bool may_defer)
 {
 ASSERT(v->arch.vm_event);
 
-if ( hvm_monitor_crX(CR4, value, old_cr) )
+if ( hvm_monitor_crX(CR4, value, old_cr) &&
+ v->domain->arch.monitor.control_register_values )
 {
 /* The actual write will occur in hvm_do_resume(), if permitted. */
 v->arch.vm_event->write_data.do_write.cr4 = 1;
@@ -3587,13 +3590,17 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
 
 ASSERT(v->arch.vm_event);
 
-/* The actual write will occur in hvm_do_resume() (if permitted). */
-v->arch.vm_event->write_data.do_write.msr = 1;
-v->arch.vm_event->write_data.msr = msr;
-v->arch.vm_event->write_data.value = msr_content;
-
 hvm_monitor_msr(msr, msr_content, msr_old_content);
-return X86EMUL_OKAY;
+
+if ( v->domain->arch.monitor.control_register_values )
+{
+/* The actual write will occur in hvm_do_resume(), if permitted. */
+v->arch.vm_event->write_data.do_write.msr = 1;
+v->arch.vm_event->write_data.msr = msr;
+v->arch.vm_event->write_data.value = msr_content;
+
+return X86EMUL_OKAY;
+}
 }
 
 if ( (ret = guest_wrmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE )
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index bbcb7536c7..1517a97f50 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -144,7 +144,15 @@ int arch_monitor_domctl_event(struct domain *d,
   struct xen_domctl_monitor_op *mop)
 {
 struct arch_domain *ad = &d->arch;
-bool requested_status = (XEN_DOMCTL_MONITOR_OP_ENABLE == mop->op);
+bool requested_status;
+
+if ( XEN_DOMCTL_MONITOR_OP_CONTROL_REGISTERS == mop->op )
+{
+ad->monitor.control_register_values = true;
+return 0;
+}
+
+requested_status = (XEN_DOMCTL_MONITOR_OP_ENABLE == mop->op);
 
 switch ( mop->event )
 {
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 5b6d909266..d890ab7a22 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -416,6 +416,7 @@ struct arch_domain
  * This is used to filter out pagefaults.
  */
 unsigned int inguest_pagefault_disabled: 1;
+unsigned int control_register_values   : 1;
 struct monitor_msr_bitmap *msr_bitmap;
 uint64_t write_ctrlreg_mask[4];
 } monitor;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 1ad34c35eb..cbcd25f12c 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1025,6 +1025,7 @@ struct xen_domctl_psr_cmt_op {
 #define XEN_DOMCTL_MONITOR_OP_DISABLE   1
 #define XEN_DOMCTL_MONITOR_OP_GET_CAPABILITIES  2
 #define XEN_DOMCTL_MONITOR_OP_EMULATE_EACH_REP  3
+#define XEN_DOMCTL_MONITOR_OP_CONTROL_REGISTERS 4
 
 #define XEN_DOMCTL_MONITOR_EVENT_WRITE_CTRLREG 0
 #define XEN_DOMCTL_MONITOR_EVENT_MOV_TO_MSR 1
-- 
2.26.1




Re: [PATCH for-4.14 0/3] Remove the 1GB limitation on Rasberry Pi 4

2020-05-19 Thread Tamas K Lengyel
On Tue, May 19, 2020 at 11:23 AM Julien Grall  wrote:
>
>
>
> On 19/05/2020 04:08, Tamas K Lengyel wrote:
> > On Mon, May 18, 2020 at 5:32 AM Julien Grall  wrote:
> >>
> >> From: Julien Grall 
> >>
> >> Hi all,
> >>
> >> At the moment, a user who wants to boot Xen on the Raspberry Pi 4 can
> >> only use the first GB of memory.
> >>
> >> This is because several devices cannot DMA above 1GB but Xen doesn't
> >> necessarily allocate memory for Dom0 below 1GB.
> >>
> >> This small series is trying to address the problem by allowing a
> >> platform to restrict where Dom0 banks are allocated.
> >>
> >> This is also a candidate for Xen 4.14. Without it, a user will not be
> >> able to use all the RAM on the Raspberry Pi 4.
> >>
> >> This series has only be slighlty tested. I would appreciate more test on
> >> the Rasbperry Pi 4 to confirm this removing the restriction.
> >
> > Hi Julien,
>
> Hi,
>
> > could you post a git branch somewhere? I can try this on my rpi4 that
> > already runs 4.13.
>
> I have pushed a branch based on unstable and the v2 of the series:
>
> git://xenbits.xen.org/people/julieng/xen-unstable.git
>
> branch arm-dma/v2
>

I've updated my image I built with
https://github.com/tklengyel/xen-rpi4-builder a while ago and I've
defined 2048m as total_mem and Xen seems to be booting fine and passes
execution to dom0. With 512m being set as the Xen cmdline for dom0_mem
it was working. When I increased the mem for dom0 the boot is now
stuck at:

[1.427788] of_cfs_init
[1.429667] of_cfs_init: OK
[1.432561] clk: Not disabling unused clocks
[1.437239] Waiting for root device /dev/mmcblk0p2...
[1.451599] mmc1: queuing unknown CIS tuple 0x80 (2 bytes)
[1.458156] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
[1.464729] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
[1.472804] mmc1: queuing unknown CIS tuple 0x80 (7 bytes)
[1.479370] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
[1.546902] random: fast init done
[1.564590] mmc1: new high speed SDIO card at address 0001

Could this be because the DTB I compiled from a fresh checkout of
https://github.com/raspberrypi/linux.git branch rpi-4.19.y whereas the
kernel itself is from a checkout ~5 months ago? I guess that must be
the cause because even if I decrease the dom0_mem to 512m it still
gets stuck at the same spot whereas it was booting fine before.

Tamas



Re: [PATCH for-4.14 0/3] Remove the 1GB limitation on Rasberry Pi 4

2020-05-19 Thread Tamas K Lengyel
On Tue, May 19, 2020 at 5:50 PM Roman Shaposhnik  wrote:
>
> On Tue, May 19, 2020 at 4:44 PM Tamas K Lengyel
>  wrote:
> >
> > On Tue, May 19, 2020 at 11:23 AM Julien Grall  wrote:
> > >
> > >
> > >
> > > On 19/05/2020 04:08, Tamas K Lengyel wrote:
> > > > On Mon, May 18, 2020 at 5:32 AM Julien Grall  wrote:
> > > >>
> > > >> From: Julien Grall 
> > > >>
> > > >> Hi all,
> > > >>
> > > >> At the moment, a user who wants to boot Xen on the Raspberry Pi 4 can
> > > >> only use the first GB of memory.
> > > >>
> > > >> This is because several devices cannot DMA above 1GB but Xen doesn't
> > > >> necessarily allocate memory for Dom0 below 1GB.
> > > >>
> > > >> This small series is trying to address the problem by allowing a
> > > >> platform to restrict where Dom0 banks are allocated.
> > > >>
> > > >> This is also a candidate for Xen 4.14. Without it, a user will not be
> > > >> able to use all the RAM on the Raspberry Pi 4.
> > > >>
> > > >> This series has only be slighlty tested. I would appreciate more test 
> > > >> on
> > > >> the Rasbperry Pi 4 to confirm this removing the restriction.
> > > >
> > > > Hi Julien,
> > >
> > > Hi,
> > >
> > > > could you post a git branch somewhere? I can try this on my rpi4 that
> > > > already runs 4.13.
> > >
> > > I have pushed a branch based on unstable and the v2 of the series:
> > >
> > > git://xenbits.xen.org/people/julieng/xen-unstable.git
> > >
> > > branch arm-dma/v2
> > >
> >
> > I've updated my image I built with
> > https://github.com/tklengyel/xen-rpi4-builder a while ago and I've
> > defined 2048m as total_mem and Xen seems to be booting fine and passes
> > execution to dom0. With 512m being set as the Xen cmdline for dom0_mem
> > it was working. When I increased the mem for dom0 the boot is now
> > stuck at:
> >
> > [1.427788] of_cfs_init
> > [1.429667] of_cfs_init: OK
> > [1.432561] clk: Not disabling unused clocks
> > [1.437239] Waiting for root device /dev/mmcblk0p2...
> > [1.451599] mmc1: queuing unknown CIS tuple 0x80 (2 bytes)
> > [1.458156] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
> > [1.464729] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
> > [1.472804] mmc1: queuing unknown CIS tuple 0x80 (7 bytes)
> > [1.479370] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
> > [1.546902] random: fast init done
> > [1.564590] mmc1: new high speed SDIO card at address 0001
> >
> > Could this be because the DTB I compiled from a fresh checkout of
> > https://github.com/raspberrypi/linux.git branch rpi-4.19.y whereas the
> > kernel itself is from a checkout ~5 months ago? I guess that must be
> > the cause because even if I decrease the dom0_mem to 512m it still
> > gets stuck at the same spot whereas it was booting fine before.
>
> Stefano and I are testing the fix right now -- for now just set your
> Dom0 mem to less than 512m.

Actually seems to work after I recompiled the kernel and reinstalled
all kernel modules. Xen boots with 4gb RAM and dom0 boots with 2g:

xl info:
...
total_memory   : 3956
free_memory: 1842

cat /proc/meminfo
MemTotal:1963844 kB

I get an emergency shell during boot on the console complaining about
xenbr0 not coming up but if I just hit continue it boots fine and the
network is up. So AFAICT things are good.

Cheers,
Tamas



Re: [PATCH for-4.14 0/3] Remove the 1GB limitation on Rasberry Pi 4

2020-05-19 Thread Tamas K Lengyel
On Tue, May 19, 2020 at 8:28 PM Roman Shaposhnik  wrote:
>
> On Tue, May 19, 2020, 7:15 PM Tamas K Lengyel  
> wrote:
>>
>> On Tue, May 19, 2020 at 5:50 PM Roman Shaposhnik  wrote:
>> >
>> > On Tue, May 19, 2020 at 4:44 PM Tamas K Lengyel
>> >  wrote:
>> > >
>> > > On Tue, May 19, 2020 at 11:23 AM Julien Grall  wrote:
>> > > >
>> > > >
>> > > >
>> > > > On 19/05/2020 04:08, Tamas K Lengyel wrote:
>> > > > > On Mon, May 18, 2020 at 5:32 AM Julien Grall  wrote:
>> > > > >>
>> > > > >> From: Julien Grall 
>> > > > >>
>> > > > >> Hi all,
>> > > > >>
>> > > > >> At the moment, a user who wants to boot Xen on the Raspberry Pi 4 
>> > > > >> can
>> > > > >> only use the first GB of memory.
>> > > > >>
>> > > > >> This is because several devices cannot DMA above 1GB but Xen doesn't
>> > > > >> necessarily allocate memory for Dom0 below 1GB.
>> > > > >>
>> > > > >> This small series is trying to address the problem by allowing a
>> > > > >> platform to restrict where Dom0 banks are allocated.
>> > > > >>
>> > > > >> This is also a candidate for Xen 4.14. Without it, a user will not 
>> > > > >> be
>> > > > >> able to use all the RAM on the Raspberry Pi 4.
>> > > > >>
>> > > > >> This series has only be slighlty tested. I would appreciate more 
>> > > > >> test on
>> > > > >> the Rasbperry Pi 4 to confirm this removing the restriction.
>> > > > >
>> > > > > Hi Julien,
>> > > >
>> > > > Hi,
>> > > >
>> > > > > could you post a git branch somewhere? I can try this on my rpi4 that
>> > > > > already runs 4.13.
>> > > >
>> > > > I have pushed a branch based on unstable and the v2 of the series:
>> > > >
>> > > > git://xenbits.xen.org/people/julieng/xen-unstable.git
>> > > >
>> > > > branch arm-dma/v2
>> > > >
>> > >
>> > > I've updated my image I built with
>> > > https://github.com/tklengyel/xen-rpi4-builder a while ago and I've
>> > > defined 2048m as total_mem and Xen seems to be booting fine and passes
>> > > execution to dom0. With 512m being set as the Xen cmdline for dom0_mem
>> > > it was working. When I increased the mem for dom0 the boot is now
>> > > stuck at:
>> > >
>> > > [1.427788] of_cfs_init
>> > > [1.429667] of_cfs_init: OK
>> > > [1.432561] clk: Not disabling unused clocks
>> > > [1.437239] Waiting for root device /dev/mmcblk0p2...
>> > > [1.451599] mmc1: queuing unknown CIS tuple 0x80 (2 bytes)
>> > > [1.458156] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
>> > > [1.464729] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
>> > > [1.472804] mmc1: queuing unknown CIS tuple 0x80 (7 bytes)
>> > > [1.479370] mmc1: queuing unknown CIS tuple 0x80 (3 bytes)
>> > > [1.546902] random: fast init done
>> > > [1.564590] mmc1: new high speed SDIO card at address 0001
>> > >
>> > > Could this be because the DTB I compiled from a fresh checkout of
>> > > https://github.com/raspberrypi/linux.git branch rpi-4.19.y whereas the
>> > > kernel itself is from a checkout ~5 months ago? I guess that must be
>> > > the cause because even if I decrease the dom0_mem to 512m it still
>> > > gets stuck at the same spot whereas it was booting fine before.
>> >
>> > Stefano and I are testing the fix right now -- for now just set your
>> > Dom0 mem to less than 512m.
>>
>> Actually seems to work after I recompiled the kernel and reinstalled
>> all kernel modules. Xen boots with 4gb RAM and dom0 boots with 2g:
>>
>> xl info:
>> ...
>> total_memory   : 3956
>> free_memory: 1842
>>
>> cat /proc/meminfo
>> MemTotal:1963844 kB
>>
>> I get an emergency shell during boot on the console complaining about
>> xenbr0 not coming up but if I just hit continue it boots fine and the
>> network is up. So AFAICT things are good.
>
>
> What exact version of the kernel are you using and what did you build it from?
>
> FWIW: 5.6.x clearly has an issue with DMA.

As I said above: https://github.com/raspberrypi/linux.git branch
rpi-4.19.y, I applied the Linux patches from the xen-rpi4-builder
repo, just changing the dom0_mem option in patch 1. I reverted the
xen-rpi4-builder a couple of revisions so as not to build using the DTB
overlay.

Tamas



Re: [PATCH for-4.14 1/2] x86/mem_sharing: Prohibit interrupt injection for forks

2020-05-21 Thread Tamas K Lengyel
> diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
> index 000e14af49..3814795e3f 100644
> --- a/xen/arch/x86/hvm/vmx/intr.c
> +++ b/xen/arch/x86/hvm/vmx/intr.c
> @@ -256,6 +256,10 @@ void vmx_intr_assist(void)
>  if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
>  return;
>

Just noticed after sending the patch that this block needs to be wrapped in

#ifdef CONFIG_MEM_SHARING

> +/* Block event injection for VM fork if requested */
> +if ( unlikely(v->domain->arch.hvm.mem_sharing.prohibit_interrupts) )
> +return;

#endif

> +
>  /* Crank the handle on interrupt state. */
>  pt_vector = pt_update_irq(v);
>

I can resend if necessary but its also a trivial fixup when applying
so let me know what would be preferred. I pushed the fixed-up version
to 
http://xenbits.xen.org/gitweb/?p=people/tklengyel/xen.git;a=shortlog;h=refs/heads/fork_interrupts.

Thanks,
Tamas



[PATCH for-4.14 2/2] tools/libxc: xc_memshr_fork with interrupts disabled

2020-05-21 Thread Tamas K Lengyel
Toolstack side for creating forks with interrupt injection disabled.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxc/include/xenctrl.h | 3 ++-
 tools/libxc/xc_memshr.c   | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 45ff7db1e8..0ea839b72a 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2242,7 +2242,8 @@ int xc_memshr_range_share(xc_interface *xch,
 int xc_memshr_fork(xc_interface *xch,
uint32_t source_domain,
uint32_t client_domain,
-   bool allow_with_iommu);
+   bool allow_with_iommu,
+   bool prohibit_interrupts);
 
 /*
  * Note: this function is only intended to be used on short-lived forks that
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 2300cc7075..e2de1d3aa2 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -240,7 +240,7 @@ int xc_memshr_debug_gref(xc_interface *xch,
 }
 
 int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
-   bool allow_with_iommu)
+   bool allow_with_iommu, bool prohibit_interrupts)
 {
 xen_mem_sharing_op_t mso;
 
@@ -251,6 +251,8 @@ int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
 
 if ( allow_with_iommu )
 mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+if ( prohibit_interrupts )
+mso.u.fork.flags |= XENMEM_FORK_PROHIBIT_INTERRUPTS;
 
 return xc_memshr_memop(xch, domid, &mso);
 }
-- 
2.25.1




[PATCH for-4.14 1/2] x86/mem_sharing: Prohibit interrupt injection for forks

2020-05-21 Thread Tamas K Lengyel
When running shallow forks without device models it may be undesirable for Xen
to inject interrupts. With Windows forks we have observed the kernel going into
infinite loops when trying to process such interrupts. By disabling interrupt
injection the fuzzer can exercise the target code without interference.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/hvm/vmx/intr.c  | 4 
 xen/arch/x86/mm/mem_sharing.c| 6 +-
 xen/include/asm-x86/hvm/domain.h | 2 ++
 xen/include/public/memory.h  | 1 +
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
index 000e14af49..3814795e3f 100644
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -256,6 +256,10 @@ void vmx_intr_assist(void)
 if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
 return;
 
+/* Block event injection for VM fork if requested */
+if ( unlikely(v->domain->arch.hvm.mem_sharing.prohibit_interrupts) )
+return;
+
 /* Crank the handle on interrupt state. */
 pt_vector = pt_update_irq(v);
 
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7271e5c90b..7352fce866 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -2106,7 +2106,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = -EINVAL;
 if ( mso.u.fork.pad )
 goto out;
-if ( mso.u.fork.flags & ~XENMEM_FORK_WITH_IOMMU_ALLOWED )
+if ( mso.u.fork.flags & ~(XENMEM_FORK_WITH_IOMMU_ALLOWED |
+  XENMEM_FORK_PROHIBIT_INTERRUPTS) )
 goto out;
 
 rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
@@ -2134,6 +2135,9 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
"lh", XENMEM_sharing_op,
arg);
+else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_PROHIBIT_INTERRUPTS) )
+d->arch.hvm.mem_sharing.prohibit_interrupts = true;
+
 rcu_unlock_domain(pd);
 break;
 }
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 95fe18cddc..e114f818d3 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -74,6 +74,8 @@ struct mem_sharing_domain
  * to resume the search.
  */
 unsigned long next_shared_gfn_to_relinquish;
+
+bool prohibit_interrupts;
 };
 #endif
 
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index dbd35305df..fe2e6caa68 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -537,6 +537,7 @@ struct xen_mem_sharing_op {
 struct mem_sharing_op_fork {  /* OP_FORK */
 domid_t parent_domain;/* IN: parent's domain id */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
+#define XENMEM_FORK_PROHIBIT_INTERRUPTS (1u << 1)
 uint16_t flags;   /* IN: optional settings */
 uint32_t pad; /* Must be set to 0 */
 } fork;
-- 
2.25.1




Re: i915 dma faults on Xen

2020-10-15 Thread Tamas K Lengyel
> > Can you paste the memory map as printed by Xen when booting, and what
> > command line are you using to boot Xen.
>
> So this is OpenXT, and it's booting EFI -> xen -> tboot -> xen

Unrelated comment: since tboot now has a PE build
(http://hg.code.sf.net/p/tboot/code/rev/5c68f0963a78) I think it would
be time for OpenXT to drop the weird efi->xen->tboot->xen flow and
just do efi->tboot->xen. The only reason we did efi->xen->tboot->xen was
because tboot didn't have a PE build at the time. It's a very hackish
solution that's no longer needed.

Tamas



Re: [PATCH v2 3/5] x86: don't override INVALID_M2P_ENTRY with SHARED_M2P_ENTRY

2020-08-24 Thread Tamas K Lengyel
On Mon, Aug 24, 2020 at 9:06 AM Jan Beulich  wrote:
>
> On 24.08.2020 15:00, Andrew Cooper wrote:
> > On 24/08/2020 13:34, Jan Beulich wrote:
> >> While in most cases code ahead of the invocation of set_gpfn_from_mfn()
> >> deals with shared pages, at least in set_typed_p2m_entry() I can't spot
> >> such handling (it's entirely possible there's code missing there). Let's
> >> try to play safe and add an extra check.
> >>
> >> Signed-off-by: Jan Beulich 
> >
> > I agree that this is an improvement.
> >
> > Therefore, tentatively Acked-by: Andrew Cooper 
>
> Thanks, but - what do I do with a tentative ack? Take it as a "normal"
> one, or not take it at all?
>
> > However, I don't think it is legitimate for set_gpfn_from_mfn() to be
> > overriding anything.
> >
> > IMO, we should be asserting something like (pfn == SHARED_M2P_ENTRY) ==
> > (d == dom_cow).
> >
> > Any code not passing this properly is almost certainly broken anyway,
> > and fixing up behind its back like this doesn't feel like a clever idea
> > (in debug builds at least).
>
> As said on v1: I agree in principle, but I'd like such a change to be
> made by the mem-sharing maintainer(s), so we wouldn't notice fallout
> only several months or years later. Tamas - would you be up for this?

Please feel free to add that ASSERT; if it does actually catch a
situation where it doesn't hold, we'll fix it when it crosses our path.
It might indeed be several months/years before we get there. Currently
I have no bandwidth to check manually whether it triggers anything.
Having some CI tests would help with this for sure, but currently I
only check stuff like this by hand when we get to rc's.
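
For reference, a minimal sketch of what that assertion might look like,
assuming it sits somewhere where both the owning domain and the pfn being
written are in scope (so 'd' here is an assumption about the call site,
not part of the quoted proposal):

    /* Only dom_cow pages may carry SHARED_M2P_ENTRY, and all of them must. */
    ASSERT((pfn == SHARED_M2P_ENTRY) == (d == dom_cow));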

Tamas



Re: [PATCH v9 0/8] domain context infrastructure

2020-09-29 Thread Tamas K Lengyel
On Tue, Sep 29, 2020 at 7:54 AM Durrant, Paul  wrote:
>
> > -Original Message-
> > From: Lengyel, Tamas 
> > Sent: 28 September 2020 15:17
> > To: p...@xen.org; xen-devel@lists.xenproject.org
> > Cc: Durrant, Paul ; 'Andrew Cooper' 
> > ; 'Daniel De
> > Graaf' ; 'George Dunlap' ; 
> > 'Ian Jackson'
> > ; 'Jan Beulich' ; 'Julien 
> > Grall' ;
> > 'Marek Marczykowski-Górecki' ; 'Roger Pau 
> > Monné'
> > ; 'Stefano Stabellini' ; 
> > 'Volodymyr Babchuk'
> > ; 'Wei Liu' 
> > Subject: RE: [EXTERNAL] [PATCH v9 0/8] domain context infrastructure
> >
> > CAUTION: This email originated from outside of the organization. Do not 
> > click links or open
> > attachments unless you can confirm the sender and know the content is safe.
> >
> >
> >
> > > > Hi Paul,
> > > > Could you push a git branch somewhere for this series? I would like to
> > > > see this being integrated with VM forking and if its not too much
> > > > effort just create the patch for that so that it could be appended to 
> > > > the
> > > series.
> > > >
> > >
> > > Hi Tamas,
> > >
> > >   Done. See
> > > https://xenbits.xen.org/gitweb/?p=people/pauldu/xen.git;a=shortlog;h=refs/h
> > > eads/domain-save14
> > >
> > >   Cheers,
> > >
> > > Paul
> >
> > Hi Paul,
> > I added a small patch that would save & load the PV context from one domain 
> > to another that would be
> > called during VM forking. Please take a look at
> > https://xenbits.xen.org/gitweb/?p=people/tklengyel/xen.git;a=commitdiff;h=1843ca7302e415317fdb9a63b3a4
> > d29a385dc766;hp=8149296fdf80c73727e61cea6fe3251aecf8b333. I called the 
> > function copy_pv_domaincontext
> > for now as that seemed like the most appropriate description for it. Please 
> > let me know if this looks
> > good to you. I'm still testing it but if everything checks out it would be 
> > nice to just append this
> > patch to your series.
>
> Hi Tamas,
>
>   The code structure appears to be ok... just some cosmetic tweaks:
>
> - I think you should call the function simply 'copy_domaincontext' as the 
> idea is that all state (including what is now in hvm context) will be 
> consolidated

Sure, I wasn't entirely clear about whether this will be limited to PV
context or if it will eventually add the hvm stuff too. Right now I
would still have to do that separately.

> - The prevailing style in domctl.c AFAICS is that assignments are mostly not 
> done inside if statements. Personally I think this is a good thing.

I think it cuts down on function sizes when all that is being done
after an assignment is a NULL-check; no need for a separate line for
it. But I also don't care that much, so if it's more important to
whoever maintains this to keep the style consistent in this regard, I
can change it.
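
For illustration, the two styles being weighed look roughly like this
(hypothetical snippet, not taken from the actual patch):

    struct thing *t;

    /* assignment inside the if: one line shorter */
    if ( (t = malloc(sizeof(*t))) == NULL )
        return -ENOMEM;

    /* prevailing domctl.c style: separate assignment, then the check */
    t = malloc(sizeof(*t));
    if ( t == NULL )
        return -ENOMEM;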

>
>   Once you have something ready to go then I'd be happy to tag it onto my 
> series if I need to do a v10... but I'm currently hoping that won't be 
> necessary.

I think I'll wait until HVM context is included in the framework as
well so that we can just switch over everything at once.

Tamas



Re: [PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-24 Thread Tamas K Lengyel
On Sun, May 24, 2020 at 8:33 PM Tian, Kevin  wrote:
>
> > From: Lengyel, Tamas 
> > Sent: Saturday, May 23, 2020 12:34 AM
> >
> > When running shallow forks without device models it may be undesirable for
> > Xen
>
> what is shallow forks? and why interrupt injection is not desired without
> device model? If it means just without Qemu thing, you still get local APIC
> interrupts such as timers, PMI, etc.

I refer to shallow forks as VM forks that run without a device model
(ie. QEMU). Effectively these are domains that run only with CPU and
memory, both of which are copied from the parent VM as needed. When an
interrupt is injected into a VM fork (because its state is copied from
a parent where an interrupt might be pending) the interrupt handler
might want to talk to the device model which is not present for the
fork. In such situations the VM fork ends up executing the interrupt
handler instead of the code we want to fuzz, which we want to avoid
for obvious reasons.

>
> > to inject interrupts. With Windows forks we have observed the kernel going
> > into
> > infinite loops when trying to process such interrupts, likely because it
> > attempts
>
> what is the relationship between shallow forks and windows forks then?

They are the same, but we only observed this behavior with Windows forks.

>
> > to interact with devices that are not responding without QEMU running. By
> > disabling interrupt injection the fuzzer can exercise the target code 
> > without
> > interference.
>
> what is the fuzzer?

https://github.com/intel/kernel-fuzzer-for-xen-project/

>
> >
> > Forks & memory sharing are only available on Intel CPUs so this only applies
> > to vmx.
>
> I feel lots of background is missing thus difficult to judge whether below 
> change
> is desired...

You may find the VM forking series worthwhile to review to get some
context: 
https://lists.xenproject.org/archives/html/xen-devel/2020-04/msg01162.html.
In a nutshell, it's an experimental feature geared towards fuzzing and
it's disabled by default (note that it's gated on CONFIG_MEM_SHARING
being enabled).

Tamas



Re: [PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-25 Thread Tamas K Lengyel
On Mon, May 25, 2020 at 6:18 AM Tamas K Lengyel  wrote:
>
> On Mon, May 25, 2020 at 12:06 AM Jan Beulich  wrote:
> >
> > On 22.05.2020 18:33, Tamas K Lengyel wrote:
> > > When running shallow forks without device models it may be undesirable 
> > > for Xen
> > > to inject interrupts. With Windows forks we have observed the kernel 
> > > going into
> > > infinite loops when trying to process such interrupts, likely because it 
> > > attempts
> > > to interact with devices that are not responding without QEMU running. By
> > > disabling interrupt injection the fuzzer can exercise the target code 
> > > without
> > > interference.
> > >
> > > Forks & memory sharing are only available on Intel CPUs so this only 
> > > applies
> > > to vmx.
> >
> > Looking at e.g. mem_sharing_control() I can't seem to be able to confirm
> > this. Would you mind pointing me at where this restriction is coming from?
>
> Both mem_access and mem_sharing are only implemented for EPT:
> http://xenbits.xen.org/hg/xen-unstable.hg/file/5eadf9363c25/xen/arch/x86/mm/p2m-ept.c#l126.
>
> >
> > > --- a/xen/arch/x86/hvm/vmx/intr.c
> > > +++ b/xen/arch/x86/hvm/vmx/intr.c
> > > @@ -256,6 +256,12 @@ void vmx_intr_assist(void)
> > >  if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
> > >  return;
> > >
> > > +#ifdef CONFIG_MEM_SHARING
> > > +/* Block event injection for VM fork if requested */
> > > +if ( unlikely(v->domain->arch.hvm.mem_sharing.block_interrupts) )
> > > +return;
> > > +#endif
> >
> > The two earlier returns are temporary as far as the guest is concerned,
> > i.e. eventually the interrupt(s) will get delivered. The one you add
> > looks as if it is a permanent thing, i.e. interrupt requests will pile
> > up and potentially confuse a guest down the road. This _may_ be okay
> > for your short-lived-shallow-fork scenario, but then wants at least
> > calling out in the public header by a comment (and I think the same
> > goes for XENMEM_FORK_WITH_IOMMU_ALLOWED that's already there).
>
> This is indeed only for the short-lived forks, that's why this is an
> optional flag that can be enabled when creating forks and it's not on
> by default. In that use-case the VM executes for fractions of a second
> and we want to execute only very specific code segments with
> absolutely no interference. Interrupts in that case are just a
> nuisance that in the best case slows the fuzzing process down, but as
> we observed, in the worst case can completely stall it.
>
> >
> > > --- a/xen/include/asm-x86/hvm/domain.h
> > > +++ b/xen/include/asm-x86/hvm/domain.h
> > > @@ -74,6 +74,8 @@ struct mem_sharing_domain
> > >   * to resume the search.
> > >   */
> > >  unsigned long next_shared_gfn_to_relinquish;
> > > +
> > > +bool block_interrupts;
> > >  };
> >
> > Please can you avoid unnecessary growth of the structure by inserting
> > next to the pre-existing bool rather than at the end?
>
> Sure. Do you want me to resend the patch for that?

I'll just resend it anyway with the requested comments in the public header.

Tamas



Re: [PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-25 Thread Tamas K Lengyel
On Mon, May 25, 2020 at 8:06 AM Jan Beulich  wrote:
>
> On 25.05.2020 15:46, Tamas K Lengyel wrote:
> > On Mon, May 25, 2020 at 7:06 AM Jan Beulich  wrote:
> >>
> >> On 25.05.2020 14:18, Tamas K Lengyel wrote:
> >>> On Mon, May 25, 2020 at 12:06 AM Jan Beulich  wrote:
> >>>>
> >>>> On 22.05.2020 18:33, Tamas K Lengyel wrote:
> >>>>> When running shallow forks without device models it may be undesirable 
> >>>>> for Xen
> >>>>> to inject interrupts. With Windows forks we have observed the kernel 
> >>>>> going into
> >>>>> infinite loops when trying to process such interrupts, likely because 
> >>>>> it attempts
> >>>>> to interact with devices that are not responding without QEMU running. 
> >>>>> By
> >>>>> disabling interrupt injection the fuzzer can exercise the target code 
> >>>>> without
> >>>>> interference.
> >>>>>
> >>>>> Forks & memory sharing are only available on Intel CPUs so this only 
> >>>>> applies
> >>>>> to vmx.
> >>>>
> >>>> Looking at e.g. mem_sharing_control() I can't seem to be able to confirm
> >>>> this. Would you mind pointing me at where this restriction is coming 
> >>>> from?
> >>>
> >>> Both mem_access and mem_sharing are only implemented for EPT:
> >>> http://xenbits.xen.org/hg/xen-unstable.hg/file/5eadf9363c25/xen/arch/x86/mm/p2m-ept.c#l126.
> >>
> >> p2m-pt.c:p2m_type_to_flags() has a similar case label.
> >
> > It doesn't do anything though, does it? For mem_sharing to work you
> > actively have to restrict the memory permissions on the shared entries
> > to be read/execute only. That's only done for EPT.
>
> Does it not? It seems to me that it does, seeing the case sits
> together with the p2m_ram_ro and p2m_ram_logdirty ones:
>
> case p2m_ram_ro:
> case p2m_ram_logdirty:
> case p2m_ram_shared:
> return flags | P2M_BASE_FLAGS;
>
> >> And I can't
> >> spot a respective restriction in mem_sharing_memop(), i.e. it looks
> >> to me as if enabling mem-sharing on NPT (to satisfy hap_enabled()
> >> in mem_sharing_control()) would be possible.
> >
> > If you are looking for an explicit gate like that, then you are right,
> > there isn't one. You can ask the original authors of this subsystem
> > why that is. If you feel like adding an extra gate, I wouldn't object.
>
> Well, the question here isn't about gating - that's an independent
> bug if it's indeed missing. The question is whether SVM code also
> needs touching, as was previously requested. You tried to address
> this by stating an Intel-only limitation, which I couldn't find
> proof for (so far).

Well, as far as I'm concerned VM forking is for Intel hardware only.
If mem_sharing seems to work for non-Intel hw - I was unaware of that
- then I'll just add an extra check for the VM fork hypercall that
gates it. It may technically be possible to make it available for
other hw as well, but at this time that's completely out of scope.

Tamas



Re: [PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-25 Thread Tamas K Lengyel
On Mon, May 25, 2020 at 7:06 AM Jan Beulich  wrote:
>
> On 25.05.2020 14:18, Tamas K Lengyel wrote:
> > On Mon, May 25, 2020 at 12:06 AM Jan Beulich  wrote:
> >>
> >> On 22.05.2020 18:33, Tamas K Lengyel wrote:
> >>> When running shallow forks without device models it may be undesirable 
> >>> for Xen
> >>> to inject interrupts. With Windows forks we have observed the kernel 
> >>> going into
> >>> infinite loops when trying to process such interrupts, likely because it 
> >>> attempts
> >>> to interact with devices that are not responding without QEMU running. By
> >>> disabling interrupt injection the fuzzer can exercise the target code 
> >>> without
> >>> interference.
> >>>
> >>> Forks & memory sharing are only available on Intel CPUs so this only 
> >>> applies
> >>> to vmx.
> >>
> >> Looking at e.g. mem_sharing_control() I can't seem to be able to confirm
> >> this. Would you mind pointing me at where this restriction is coming from?
> >
> > Both mem_access and mem_sharing are only implemented for EPT:
> > http://xenbits.xen.org/hg/xen-unstable.hg/file/5eadf9363c25/xen/arch/x86/mm/p2m-ept.c#l126.
>
> p2m-pt.c:p2m_type_to_flags() has a similar case label.

It doesn't do anything though, does it? For mem_sharing to work you
actively have to restrict the memory permissions on the shared entries
to be read/execute only. That's only done for EPT.
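
Roughly, the EPT-side handling in ept_p2m_type_to_flags() looks like the
following (paraphrased, so treat it as a sketch rather than the exact
source):

    case p2m_ram_shared:
        /* read/execute only: a write faults and triggers unsharing */
        entry->r = entry->x = 1;
        entry->w = 0;
        break;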

> And I can't
> spot a respective restriction in mem_sharing_memop(), i.e. it looks
> to me as if enabling mem-sharing on NPT (to satisfy hap_enabled()
> in mem_sharing_control()) would be possible.

If you are looking for an explicit gate like that, then you are right,
there isn't one. You can ask the original authors of this subsystem
why that is. If you feel like adding an extra gate, I wouldn't object.

Tamas



Re: [PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-25 Thread Tamas K Lengyel
On Mon, May 25, 2020 at 8:14 AM Tamas K Lengyel  wrote:
>
> On Mon, May 25, 2020 at 8:06 AM Jan Beulich  wrote:
> >
> > On 25.05.2020 15:46, Tamas K Lengyel wrote:
> > > On Mon, May 25, 2020 at 7:06 AM Jan Beulich  wrote:
> > >>
> > >> On 25.05.2020 14:18, Tamas K Lengyel wrote:
> > >>> On Mon, May 25, 2020 at 12:06 AM Jan Beulich  wrote:
> > >>>>
> > >>>> On 22.05.2020 18:33, Tamas K Lengyel wrote:
> > >>>>> When running shallow forks without device models it may be 
> > >>>>> undesirable for Xen
> > >>>>> to inject interrupts. With Windows forks we have observed the kernel 
> > >>>>> going into
> > >>>>> infinite loops when trying to process such interrupts, likely because 
> > >>>>> it attempts
> > >>>>> to interact with devices that are not responding without QEMU 
> > >>>>> running. By
> > >>>>> disabling interrupt injection the fuzzer can exercise the target code 
> > >>>>> without
> > >>>>> interference.
> > >>>>>
> > >>>>> Forks & memory sharing are only available on Intel CPUs so this only 
> > >>>>> applies
> > >>>>> to vmx.
> > >>>>
> > >>>> Looking at e.g. mem_sharing_control() I can't seem to be able to 
> > >>>> confirm
> > >>>> this. Would you mind pointing me at where this restriction is coming 
> > >>>> from?
> > >>>
> > >>> Both mem_access and mem_sharing are only implemented for EPT:
> > >>> http://xenbits.xen.org/hg/xen-unstable.hg/file/5eadf9363c25/xen/arch/x86/mm/p2m-ept.c#l126.
> > >>
> > >> p2m-pt.c:p2m_type_to_flags() has a similar case label.
> > >
> > > It doesn't do anything though, does it? For mem_sharing to work you
> > > actively have to restrict the memory permissions on the shared entries
> > > to be read/execute only. That's only done for EPT.
> >
> > Does it not? It seems to me that it does, seeing the case sits
> > together with the p2m_ram_ro and p2m_ram_logdirty ones:
> >
> > case p2m_ram_ro:
> > case p2m_ram_logdirty:
> > case p2m_ram_shared:
> > return flags | P2M_BASE_FLAGS;
> >
> > >> And I can't
> > >> spot a respective restriction in mem_sharing_memop(), i.e. it looks
> > >> to me as if enabling mem-sharing on NPT (to satisfy hap_enabled()
> > >> in mem_sharing_control()) would be possible.
> > >
> > > If you are looking for an explicit gate like that, then you are right,
> > > there isn't one. You can ask the original authors of this subsystem
> > > why that is. If you feel like adding an extra gate, I wouldn't object.
> >
> > Well, the question here isn't about gating - that's an independent
> > bug if it's indeed missing. The question is whether SVM code also
> > needs touching, as was previously requested. You tried to address
> > this by stating an Intel-only limitation, which I couldn't find
> > proof for (so far).
>
> Well, as far as I'm concerned VM forking is for Intel hardware only.
> If mem_sharing seems to work for non-Intel hw - I was unaware of that
> - then I'll just add an extra check for the VM fork hypercall that
> gates it. It may technically be possible to make it available for
> other hw as well, but at this time that's completely out of scope.

Actually, I'm going to just add that gate for mem_sharing as a whole.
Even if it at some point worked on other architectures (doubtful), at
this time it's a use case that's completely abandoned and forgotten
and, as far as I'm concerned, unmaintained, with no plans from my side
to ever maintain it.

Tamas



Re: [PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-25 Thread Tamas K Lengyel
On Mon, May 25, 2020 at 12:06 AM Jan Beulich  wrote:
>
> On 22.05.2020 18:33, Tamas K Lengyel wrote:
> > When running shallow forks without device models it may be undesirable for 
> > Xen
> > to inject interrupts. With Windows forks we have observed the kernel going 
> > into
> > infinite loops when trying to process such interrupts, likely because it 
> > attempts
> > to interact with devices that are not responding without QEMU running. By
> > disabling interrupt injection the fuzzer can exercise the target code 
> > without
> > interference.
> >
> > Forks & memory sharing are only available on Intel CPUs so this only applies
> > to vmx.
>
> Looking at e.g. mem_sharing_control() I can't seem to be able to confirm
> this. Would you mind pointing me at where this restriction is coming from?

Both mem_access and mem_sharing are only implemented for EPT:
http://xenbits.xen.org/hg/xen-unstable.hg/file/5eadf9363c25/xen/arch/x86/mm/p2m-ept.c#l126.

>
> > --- a/xen/arch/x86/hvm/vmx/intr.c
> > +++ b/xen/arch/x86/hvm/vmx/intr.c
> > @@ -256,6 +256,12 @@ void vmx_intr_assist(void)
> >  if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
> >  return;
> >
> > +#ifdef CONFIG_MEM_SHARING
> > +/* Block event injection for VM fork if requested */
> > +if ( unlikely(v->domain->arch.hvm.mem_sharing.block_interrupts) )
> > +return;
> > +#endif
>
> The two earlier returns are temporary as far as the guest is concerned,
> i.e. eventually the interrupt(s) will get delivered. The one you add
> looks as if it is a permanent thing, i.e. interrupt requests will pile
> up and potentially confuse a guest down the road. This _may_ be okay
> for your short-lived-shallow-fork scenario, but then wants at least
> calling out in the public header by a comment (and I think the same
> goes for XENMEM_FORK_WITH_IOMMU_ALLOWED that's already there).

This is indeed only for the short-lived forks, that's why this is an
optional flag that can be enabled when creating forks and it's not on
by default. In that use-case the VM executes for fractions of a second
and we want to execute only very specific code segments with
absolutely no interference. Interrupts in that case are just a
nuisance that in the best case slows the fuzzing process down, but as
we observed, in the worst case can completely stall it.

>
> > --- a/xen/include/asm-x86/hvm/domain.h
> > +++ b/xen/include/asm-x86/hvm/domain.h
> > @@ -74,6 +74,8 @@ struct mem_sharing_domain
> >   * to resume the search.
> >   */
> >  unsigned long next_shared_gfn_to_relinquish;
> > +
> > +bool block_interrupts;
> >  };
>
> Please can you avoid unnecessary growth of the structure by inserting
> next to the pre-existing bool rather than at the end?

Sure. Do you want me to resend the patch for that?

Tamas



[PATCH] x86/mem_sharing: gate enabling on cpu_has_vmx

2020-05-25 Thread Tamas K Lengyel
From: Tamas K Lengyel 

It is unclear whether mem_sharing was ever made to work on other architectures
but at this time the only verified platform for it is vmx. No plans to support
or maintain it on other architectures. Make this explicit by checking during
initialization.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/mm/mem_sharing.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7271e5c90b..19922ab5d1 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1444,7 +1444,7 @@ static inline int mem_sharing_control(struct domain *d, bool enable,
 {
 if ( enable )
 {
-if ( unlikely(!is_hvm_domain(d)) )
+if ( unlikely(!is_hvm_domain(d) || !cpu_has_vmx) )
 return -EOPNOTSUPP;
 
 if ( unlikely(!hap_enabled(d)) )
-- 
2.26.1




[PATCH v3 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-25 Thread Tamas K Lengyel
When running shallow forks, ie. VM forks without device models (QEMU), it may
be undesirable for Xen to inject interrupts. When creating such forks from
Windows VMs we have observed the kernel trying to process interrupts
immediately after the fork is executed. However without QEMU running such
interrupt handling may not be possible because it may attempt to interact with
devices that are not emulated by a backend. In the best case scenario such
interrupt handling would only present a detour in the VM forks' execution
flow, but in the worst case as we actually observed can completely stall it.
By disabling interrupt injection a fuzzer can exercise the target code without
interference. For other use-cases this option probably doesn't make sense,
that's why this is not enabled by default.

Forks & memory sharing are only available on Intel CPUs so this only applies
to vmx. Note that this is part of the experimental VM forking feature that's
completely disabled by default and can only be enabled by using
XEN_CONFIG_EXPERT during compile time.

Signed-off-by: Tamas K Lengyel 
---
v3: add comments in the public header how this option only makes sense for
 short lived forks
minor style adjustment
v2: prohibit => block
minor style adjustments
---
 xen/arch/x86/hvm/vmx/intr.c  | 6 ++
 xen/arch/x86/mm/mem_sharing.c| 6 +-
 xen/include/asm-x86/hvm/domain.h | 2 +-
 xen/include/public/memory.h  | 3 +++
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
index 000e14af49..80bfbb4787 100644
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -256,6 +256,12 @@ void vmx_intr_assist(void)
 if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
 return;
 
+#ifdef CONFIG_MEM_SHARING
+/* Block event injection for VM fork if requested */
+if ( unlikely(v->domain->arch.hvm.mem_sharing.block_interrupts) )
+return;
+#endif
+
 /* Crank the handle on interrupt state. */
 pt_vector = pt_update_irq(v);
 
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7271e5c90b..0c45a8d67e 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -2106,7 +2106,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = -EINVAL;
 if ( mso.u.fork.pad )
 goto out;
-if ( mso.u.fork.flags & ~XENMEM_FORK_WITH_IOMMU_ALLOWED )
+if ( mso.u.fork.flags &
+ ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS) )
 goto out;
 
 rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
@@ -2134,6 +2135,9 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
"lh", XENMEM_sharing_op,
arg);
+else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_BLOCK_INTERRUPTS) )
+d->arch.hvm.mem_sharing.block_interrupts = true;
+
 rcu_unlock_domain(pd);
 break;
 }
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 95fe18cddc..9d247baf4d 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -67,7 +67,7 @@ struct hvm_ioreq_server {
 #ifdef CONFIG_MEM_SHARING
 struct mem_sharing_domain
 {
-bool enabled;
+bool enabled, block_interrupts;
 
 /*
  * When releasing shared gfn's in a preemptible manner, recall where
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index dbd35305df..850bd72c52 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -536,7 +536,10 @@ struct xen_mem_sharing_op {
 } debug;
 struct mem_sharing_op_fork {  /* OP_FORK */
 domid_t parent_domain;/* IN: parent's domain id */
+/* Only makes sense for short-lived forks */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
+/* Only makes sense for short-lived forks */
+#define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
 uint16_t flags;   /* IN: optional settings */
 uint32_t pad; /* Must be set to 0 */
 } fork;
-- 
2.25.1




[PATCH v3 for-4.14 2/2] tools/libxc: xc_memshr_fork with interrupts blocked

2020-05-25 Thread Tamas K Lengyel
Toolstack side for creating forks with interrupt injection blocked.

Signed-off-by: Tamas K Lengyel 
Reviewed-by: Roger Pau Monné 
Acked-by: Ian Jackson 
---
 tools/libxc/include/xenctrl.h | 3 ++-
 tools/libxc/xc_memshr.c   | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 45ff7db1e8..804ff001d7 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2242,7 +2242,8 @@ int xc_memshr_range_share(xc_interface *xch,
 int xc_memshr_fork(xc_interface *xch,
uint32_t source_domain,
uint32_t client_domain,
-   bool allow_with_iommu);
+   bool allow_with_iommu,
+   bool block_interrupts);
 
 /*
  * Note: this function is only intended to be used on short-lived forks that
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 2300cc7075..a6cfd7dccf 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -240,7 +240,7 @@ int xc_memshr_debug_gref(xc_interface *xch,
 }
 
 int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
-   bool allow_with_iommu)
+   bool allow_with_iommu, bool block_interrupts)
 {
 xen_mem_sharing_op_t mso;
 
@@ -251,6 +251,8 @@ int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
 
 if ( allow_with_iommu )
 mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+if ( block_interrupts )
+mso.u.fork.flags |= XENMEM_FORK_BLOCK_INTERRUPTS;
 
return xc_memshr_memop(xch, domid, &mso);
 }
-- 
2.25.1




Re: [BUG] Core scheduling patches causing deadlock in some situations

2020-05-29 Thread Tamas K Lengyel
On Fri, May 29, 2020 at 7:51 AM Michał Leszczyński
 wrote:
>
> - On 29 May 2020 at 15:15, Jürgen Groß jgr...@suse.com wrote:
>
> > On 29.05.20 14:51, Michał Leszczyński wrote:
> >> - On 29 May 2020 at 14:44, Jürgen Groß jgr...@suse.com wrote:
> >>
> >>> On 29.05.20 14:30, Michał Leszczyński wrote:
>  Hello,
> 
>  I'm running DRAKVUF on Dell Inc. PowerEdge R640/08HT8T server with 
>  Intel(R)
>  Xeon(R) Gold 6132 CPU @ 2.60GHz CPU.
>  When upgrading from Xen RELEASE 4.12 to 4.13, we have noticed some 
>  stability
>  problems concerning freezes of Dom0 (Debian Buster):
> 
>  ---
> 
>  maj 27 23:17:02 debian kernel: rcu: INFO: rcu_sched self-detected stall 
>  on CPU
>  maj 27 23:17:02 debian kernel: rcu: 0-: (5250 ticks this GP)
>  idle=cee/1/0x4002 softirq=11964/11964 fqs=2515
>  maj 27 23:17:02 debian kernel: rcu: (t=5251 jiffies g=27237 q=799)
>  maj 27 23:17:02 debian kernel: NMI backtrace for cpu 0
>  maj 27 23:17:02 debian kernel: CPU: 0 PID: 643 Comm: z_rd_int_1 Tainted: 
>  P OE
>  4.19.0-6-amd64 #1 Debian 4.19.67-2+deb10u2
>  maj 27 23:17:02 debian kernel: Hardware name: Dell Inc. PowerEdge 
>  R640/08HT8T,
>  BIOS 2.1.8 04/30/2019
>  maj 27 23:17:02 debian kernel: Call Trace:
>  maj 27 23:17:02 debian kernel: 
>  maj 27 23:17:02 debian kernel: dump_stack+0x5c/0x80
>  maj 27 23:17:02 debian kernel: nmi_cpu_backtrace.cold.4+0x13/0x50
>  maj 27 23:17:02 debian kernel: ? lapic_can_unplug_cpu.cold.29+0x3b/0x3b
>  maj 27 23:17:02 debian kernel: nmi_trigger_cpumask_backtrace+0xf9/0xfb
>  maj 27 23:17:02 debian kernel: rcu_dump_cpu_stacks+0x9b/0xcb
>  maj 27 23:17:02 debian kernel: rcu_check_callbacks.cold.81+0x1db/0x335
>  maj 27 23:17:02 debian kernel: ? tick_sched_do_timer+0x60/0x60
>  maj 27 23:17:02 debian kernel: update_process_times+0x28/0x60
>  maj 27 23:17:02 debian kernel: tick_sched_handle+0x22/0x60
> 
>  ---
> 
>  This usually results in machine being completely unresponsive and 
>  performing an
>  automated reboot after some time.
> 
>  I've bisected commits between 4.12 and 4.13 and it seems like this is 
>  the patch
>  which introduced a bug:
>  https://github.com/xen-project/xen/commit/7c7b407e77724f37c4b448930777a59a479feb21
> 
>  Enclosed you can find the `xl dmesg` log (attachment: dmesg.txt) from 
>  the fresh
>  boot of the machine on which the bug was reproduced.
> 
>  I'm also attaching the `xl info` output from this machine:
> 
>  ---
> 
>  release : 4.19.0-6-amd64
>  version : #1 SMP Debian 4.19.67-2+deb10u2 (2019-11-11)
>  machine : x86_64
>  nr_cpus : 14
>  max_cpu_id : 223
>  nr_nodes : 1
>  cores_per_socket : 14
>  threads_per_core : 1
>  cpu_mhz : 2593.930
>  hw_caps :
>  bfebfbff:77fef3ff:2c100800:0121:000f:d19b:0008:0100
>  virt_caps : pv hvm hvm_directio pv_directio hap shadow
>  total_memory : 130541
>  free_memory : 63591
>  sharing_freed_memory : 0
>  sharing_used_memory : 0
>  outstanding_claims : 0
>  free_cpus : 0
>  xen_major : 4
>  xen_minor : 13
>  xen_extra : -unstable
>  xen_version : 4.13-unstable
>  xen_caps : xen-3.0-x86_64 xen-3.0-x86_32p hvm-3.0-x86_32 hvm-3.0-x86_32p
>  hvm-3.0-x86_64
>  xen_scheduler : credit2
>  xen_pagesize : 4096
>  platform_params : virt_start=0x8000
>  xen_changeset : Wed Oct 2 09:27:27 2019 +0200 git:7c7b407e77-dirty
> >>>
> >>> Which is your original Xen base? This output is clearly obtained at the
> >>> end of the bisect process.
> >>>
> >>> There have been quite some corrections since the release of Xen 4.13, so
> >>> please make sure you are running the most actual version (4.13.1).
> >>>
> >>>
> >>> Juergen
> >>
> >> Sure, we have tested both RELEASE 4.13 and RELEASE 4.13.1. Unfortunately 
> >> these
> >> corrections didn't help and the bug is still reproducible.
> >>
> >>  From our testing it turns out that:
> >>
> >> Known working revision: 997d6248a9ae932d0dbaac8d8755c2b15fec25dc
> >> Broken revision: 6278553325a9f76d37811923221b21db3882e017
> >> First bad commit: 7c7b407e77724f37c4b448930777a59a479feb21
> >
> > Would it be possible to test xen unstable, too?
> >
> > I could imagine e.g. commit b492c65da5ec5ed or 99266e31832fb4a4 to have
> > an impact here.
> >
> >
> > Juergen
>
>
> I've tried b492c65da5ec5ed revision but it seems that there is some problem 
> with ALTP2M support, so I can't launch anything at all.
>
> maj 29 15:45:32 debian drakrun[1223]: Failed to set HVM_PARAM_ALTP2M, RC: -1
> maj 29 15:45:32 debian drakrun[1223]: VMI_ERROR: xc_altp2m_switch_to_view 
> returned rc: -1

Ough, great, that's another regression in 4.14-unstable. I ran into it
myself but couldn't spend time to figure out whether it's just
something in my 

[PATCH for-4.14] tools/libxl: fix setting altp2m param broken by 1e9bc407cf0

2020-05-29 Thread Tamas K Lengyel
The patch 1e9bc407cf0 mistakenly converted the altp2m config option to a
boolean. This is incorrect and breaks the external-only use case of altp2m,
which is set with a value of 2.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_x86.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index f8bc828e62..272736850b 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -391,7 +391,6 @@ static int hvm_set_conf_params(libxl__gc *gc, uint32_t domid,
 libxl_ctx *ctx = libxl__gc_owner(gc);
 xc_interface *xch = ctx->xch;
 int ret = ERROR_FAIL;
-bool altp2m = info->altp2m;
 
 switch(info->type) {
 case LIBXL_DOMAIN_TYPE_HVM:
@@ -433,7 +432,7 @@ static int hvm_set_conf_params(libxl__gc *gc, uint32_t domid,
 LOG(ERROR, "Couldn't set HVM_PARAM_NESTEDHVM");
 goto out;
 }
-if (xc_hvm_param_set(xch, domid, HVM_PARAM_ALTP2M, altp2m)) {
+if (xc_hvm_param_set(xch, domid, HVM_PARAM_ALTP2M, info->altp2m)) {
 LOG(ERROR, "Couldn't set HVM_PARAM_ALTP2M");
 goto out;
 }
-- 
2.26.2




Re: [BUG] Core scheduling patches causing deadlock in some situations

2020-05-29 Thread Tamas K Lengyel
On Fri, May 29, 2020 at 8:48 AM Tamas K Lengyel
 wrote:
>
> On Fri, May 29, 2020 at 7:51 AM Michał Leszczyński
>  wrote:
> >
> > - On 29 May 2020 at 15:15, Jürgen Groß jgr...@suse.com wrote:
> >
> > > On 29.05.20 14:51, Michał Leszczyński wrote:
> > >> - On 29 May 2020 at 14:44, Jürgen Groß jgr...@suse.com wrote:
> > >>
> > >>> On 29.05.20 14:30, Michał Leszczyński wrote:
> > >>>> Hello,
> > >>>>
> > >>>> I'm running DRAKVUF on Dell Inc. PowerEdge R640/08HT8T server with 
> > >>>> Intel(R)
> > >>>> Xeon(R) Gold 6132 CPU @ 2.60GHz CPU.
> > >>>> When upgrading from Xen RELEASE 4.12 to 4.13, we have noticed some 
> > >>>> stability
> > >>>> problems concerning freezes of Dom0 (Debian Buster):
> > >>>>
> > >>>> ---
> > >>>>
> > >>>> maj 27 23:17:02 debian kernel: rcu: INFO: rcu_sched self-detected 
> > >>>> stall on CPU
> > >>>> maj 27 23:17:02 debian kernel: rcu: 0-: (5250 ticks this GP)
> > >>>> idle=cee/1/0x4002 softirq=11964/11964 fqs=2515
> > >>>> maj 27 23:17:02 debian kernel: rcu: (t=5251 jiffies g=27237 q=799)
> > >>>> maj 27 23:17:02 debian kernel: NMI backtrace for cpu 0
> > >>>> maj 27 23:17:02 debian kernel: CPU: 0 PID: 643 Comm: z_rd_int_1 
> > >>>> Tainted: P OE
> > >>>> 4.19.0-6-amd64 #1 Debian 4.19.67-2+deb10u2
> > >>>> maj 27 23:17:02 debian kernel: Hardware name: Dell Inc. PowerEdge 
> > >>>> R640/08HT8T,
> > >>>> BIOS 2.1.8 04/30/2019
> > >>>> maj 27 23:17:02 debian kernel: Call Trace:
> > >>>> maj 27 23:17:02 debian kernel: 
> > >>>> maj 27 23:17:02 debian kernel: dump_stack+0x5c/0x80
> > >>>> maj 27 23:17:02 debian kernel: nmi_cpu_backtrace.cold.4+0x13/0x50
> > >>>> maj 27 23:17:02 debian kernel: ? lapic_can_unplug_cpu.cold.29+0x3b/0x3b
> > >>>> maj 27 23:17:02 debian kernel: nmi_trigger_cpumask_backtrace+0xf9/0xfb
> > >>>> maj 27 23:17:02 debian kernel: rcu_dump_cpu_stacks+0x9b/0xcb
> > >>>> maj 27 23:17:02 debian kernel: rcu_check_callbacks.cold.81+0x1db/0x335
> > >>>> maj 27 23:17:02 debian kernel: ? tick_sched_do_timer+0x60/0x60
> > >>>> maj 27 23:17:02 debian kernel: update_process_times+0x28/0x60
> > >>>> maj 27 23:17:02 debian kernel: tick_sched_handle+0x22/0x60
> > >>>>
> > >>>> ---
> > >>>>
> > >>>> This usually results in machine being completely unresponsive and 
> > >>>> performing an
> > >>>> automated reboot after some time.
> > >>>>
> > >>>> I've bisected commits between 4.12 and 4.13 and it seems like this is 
> > >>>> the patch
> > >>>> which introduced a bug:
> > >>>> https://github.com/xen-project/xen/commit/7c7b407e77724f37c4b448930777a59a479feb21
> > >>>>
> > >>>> Enclosed you can find the `xl dmesg` log (attachment: dmesg.txt) from 
> > >>>> the fresh
> > >>>> boot of the machine on which the bug was reproduced.
> > >>>>
> > >>>> I'm also attaching the `xl info` output from this machine:
> > >>>>
> > >>>> ---
> > >>>>
> > >>>> release : 4.19.0-6-amd64
> > >>>> version : #1 SMP Debian 4.19.67-2+deb10u2 (2019-11-11)
> > >>>> machine : x86_64
> > >>>> nr_cpus : 14
> > >>>> max_cpu_id : 223
> > >>>> nr_nodes : 1
> > >>>> cores_per_socket : 14
> > >>>> threads_per_core : 1
> > >>>> cpu_mhz : 2593.930
> > >>>> hw_caps :
> > >>>> bfebfbff:77fef3ff:2c100800:0121:000f:d19b:0008:0100
> > >>>> virt_caps : pv hvm hvm_directio pv_directio hap shadow
> > >>>> total_memory : 130541
> > >>>> free_memory : 63591
> > >>>> sharing_freed_memory : 0
> > >>>> sharing_used_memory : 0
> > >>>> outstanding_claims : 0
> > >>>> free_cpus : 0
> > >>>> xen_major : 4
> > >>>> xen_minor : 13
> > >>>> xen_extra : -unstable
> > >>>> xen_

Re: [PATCH for-4.14] tools/libxl: fix setting altp2m param broken by 1e9bc407cf0

2020-05-29 Thread Tamas K Lengyel
On Fri, May 29, 2020 at 10:15 AM Andrew Cooper
 wrote:
>
> On 29/05/2020 17:06, Tamas K Lengyel wrote:
> > The patch 1e9bc407cf0 mistakenly converted the altp2m config option to a
> > boolean. This is incorrect and breaks the external-only use case of altp2m,
> > which is set with a value of 2.
> >
> > Signed-off-by: Tamas K Lengyel 
>
> Urg yes.  Sorry.
>
> However, this doesn't build because there is another use of the altp2m
> variable between the two hunks below, for compatibility with the older
> altp2mhvm option.

Eh, so much for hastily sending a patch with last minute changes.

>
> I think changing its type to just int ought to suffice?

Indeed, that would work as well. Let me just resend with that.

Tamas



[PATCH v2 for-4.14] tools/libxl: fix setting altp2m param broken by 1e9bc407cf0

2020-05-29 Thread Tamas K Lengyel
The patch 1e9bc407cf0 mistakenly converted the altp2m config option to a
boolean. This is incorrect and breaks the external-only use case of altp2m,
which is set with a value of 2.

Signed-off-by: Tamas K Lengyel 
---
v2: just convert bool to unsigned int
---
 tools/libxl/libxl_x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index f8bc828e62..e57f63282e 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -391,7 +391,7 @@ static int hvm_set_conf_params(libxl__gc *gc, uint32_t domid,
 libxl_ctx *ctx = libxl__gc_owner(gc);
 xc_interface *xch = ctx->xch;
 int ret = ERROR_FAIL;
-bool altp2m = info->altp2m;
+unsigned int altp2m = info->altp2m;
 
 switch(info->type) {
 case LIBXL_DOMAIN_TYPE_HVM:
-- 
2.26.2




Re: [PATCH v2 for-4.14] tools/libxl: fix setting altp2m param broken by 1e9bc407cf0

2020-05-29 Thread Tamas K Lengyel
On Fri, May 29, 2020 at 10:32 AM Ian Jackson  wrote:
>
> Andrew Cooper writes ("Re: [PATCH v2 for-4.14] tools/libxl: fix setting 
> altp2m param broken by 1e9bc407cf0"):
> > On 29/05/2020 17:22, Tamas K Lengyel wrote:
> > > The patch 1e9bc407cf0 mistakenly converted the altp2m config option to a
> > > boolean. This is incorrect and breaks the external-only use case of
> > > altp2m, which is set with a value of 2.
> > >
> > > Signed-off-by: Tamas K Lengyel 
> >
> > Reviewed-by: Andrew Cooper 
> >
> > Sorry for breaking it to begin with.
>
> Acked-by: Ian Jackson 
>
> and pushed.

Thanks for the fast turn around.

Tamas



Re: [PATCH v19 for-4.14 00/13] VM forking

2020-06-01 Thread Tamas K Lengyel
On Mon, Jun 1, 2020 at 11:11 AM George Dunlap  wrote:
>
>
>
> > On Jun 1, 2020, at 4:07 PM, Paul Durrant  wrote:
> >
> >> -Original Message-
> >> From: Xen-devel  On Behalf Of 
> >> Tamas K Lengyel
> >> Sent: 01 June 2020 14:22
> >> To: xen-devel@lists.xenproject.org
> >> Cc: Kevin Tian ; Stefano Stabellini 
> >> ; Tamas K Lengyel
> >> ; Jun Nakajima ; Wei Liu 
> >> ; Andrew Cooper
> >> ; Ian Jackson ; 
> >> George Dunlap
> >> ; Tamas K Lengyel ; Jan 
> >> Beulich ;
> >> Anthony PERARD ; Julien Grall ; 
> >> Roger Pau Monné
> >> 
> >> Subject: [PATCH v19 for-4.14 00/13] VM forking
> >
> > Hi,
> >
> >  This series looks to be largely un-acked so, since we are now past the 
> > freeze date, I don't really think it can go into 4.14. Is there a 
> > particular reason that you think it should be considered?
>
> Tamas’ project itself mainly uses libxc and below, as I understand; and so 
> getting patches 1 and 2 in would be an important milestone; both have had 
> R-b’s before the feature freeze.  Arguably patches 1 and 2 are a bug fix.  
> Patch 1 is missing a VMX (or a general x86) ack.

Correct. The first two patches going in would decide whether we will
be able to use the 4.14 release without having to carry out-of-tree
patches. Although, as things stand with all the bugs being discovered
in 4.13 and 4.14, we will likely still have to backport all of these
patches to 4.12 by hand.
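
For context, with just patches 1 and 2 applied a fork can be driven from
libxc alone; a hypothetical sketch (domids are placeholders, the fork
domain is assumed to have been created empty beforehand, and error
handling is elided):

    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    uint32_t parent_domid = 1, fork_domid = 2;

    /* populate the fork lazily from the parent, with interrupt
     * injection blocked as per patch 1 */
    xc_memshr_fork(xch, parent_domid, fork_domid,
                   false /* allow_with_iommu */,
                   true  /* block_interrupts */);
    xc_domain_unpause(xch, fork_domid);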

> The libxl/xl side hasn’t, as I understand it, had significant review; I think 
> that should probably wait until 4.15.

Correct. It has been sent 19 times so far over a period of 9 months
with no feedback from any of the maintainers other than that it's hard
to review. We had some good discussions with other community members,
but evidently none of the toolstack maintainers care much about it.
I made a last-ditch effort to make it easier to review, but at this
point we have started implementing our own toolstack to interact with
VM forks.

> What do you think, Tamas?

If it's not going into 4.14 then it's going to be dropped. It has been
made solely for the benefit of the community, to make the new VM
forking more accessible and useful for others. Without it the only way
to use the feature is to implement your own toolstack. Initially we
were hoping that integrating support into xl/libxl would eliminate the
need for us to implement our own parallel toolstack, but since we have
to do that now anyway there is no benefit for us in carrying these
patches any further. It's disheartening that we had to resort to that,
and I will certainly try to avoid contributing to xl/libxl in the
future since I personally consider it a waste of time.

Thanks,
Tamas



Re: [PATCH v2 for-4.14 0/3] vm_event: fix race-condition when disabling monitor events

2020-06-01 Thread Tamas K Lengyel
On Wed, May 20, 2020 at 8:31 PM Tamas K Lengyel  wrote:
>
> For the last couple years we have received numerous reports from users of
> monitor vm_events of spurious guest crashes when using events. In particular,
> it has been observed that the problem occurs when vm_events are being
> disabled. The nature of the guest crash varied widely and has only
> occurred occasionally. This made debugging the issue particularly hard.
> We had discussions about this issue even here on the xen-devel mailing
> list with no luck figuring it out.
>
> The bug has now been identified as a race-condition between register event
> handling and disabling the vm_event interface.
>
> Patch 96760e2fba100d694300a81baddb5740e0f8c0ee, "vm_event: deny register
> writes if refused by vm_event reply", is the patch that introduced the
> error. In this
> patch emulation of register write events can be postponed until the
> corresponding vm_event handler decides whether to allow such a write to take
> place. Unfortunately this can only be implemented by performing the deny/allow
> step when the vCPU gets scheduled. Due to that postponed emulation of the
> event, if the user decides to pause the VM in the vm_event handler and then
> disables events, the entire emulation step is skipped the next time the
> vCPU is resumed.
> Even if the user doesn't pause during the vm_event handling but exits
> immediately and disables vm_event, the situation becomes racy, as disabling
> vm_event may succeed before the guest's vCPUs get scheduled with the pending
> emulation task. This has been particularly the case with VMs that have
> several vCPUs, as after the VM is unpaused it may actually take a long time
> before all vCPUs get scheduled.
>
> The only solution currently is to poll each vCPU before vm_events are disabled
> to verify they have been scheduled before it is safe to disable vm_events. The
> following patches resolve this issue in a much nicer way.
>
> Patch 1 adds an option to the monitor_op domctl that needs to be specified if
> the user wants to actually use the postponed register-write handling
> mechanism. If that option is not specified then handling is performed the
> same way as before patch 96760e2fba100d694300a81baddb5740e0f8c0ee.
>
> Patch 2 performs sanity checking when disabling vm_events to determine whether
> its safe to free all vm_event structures. The vCPUs still get unpaused to
> allow them to get scheduled and perform any of their pending operations,
> but otherwise an -EAGAIN error is returned signaling to the user that they
> need to wait and try again disabling the interface.
>
> Patch 3 adds a vm_event specifically to signal to the user when it is safe to
> continue disabling the interface.
>
> Shout out to our friends at CERT.pl for stumbling upon a crucial piece of
> information that led to finally squashing this nasty bug.
>
> v2: minor adjustments based on Jan's comments

Patch ping.
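
For anyone picking this up: with patch 2 in place the toolstack-side
disable path would look roughly like this (a sketch, assuming the new
-EAGAIN behaviour described above surfaces through libxc's errno):

    /* retry until all vCPUs have flushed their pending register-write
     * emulation and tearing down the interface is safe */
    while ( xc_monitor_disable(xch, domid) && errno == EAGAIN )
        usleep(1000);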

Tamas



[PATCH v19 for-4.14 08/13] tools/libxl: Adjust libxl__build_post

2020-06-01 Thread Tamas K Lengyel
Skips parts not relevant to VM forks.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_dom.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 1b55097a1a..52d49437cc 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -455,6 +455,9 @@ int libxl__build_post(libxl__gc *gc, uint32_t domid,
 char **ents;
 int i, rc;
 
+if (state->forked_vm)
+goto skip_fork;
+
 if (info->num_vnuma_nodes && !info->num_vcpu_soft_affinity) {
 rc = set_vnuma_affinity(gc, domid, info);
 if (rc)
@@ -475,6 +478,7 @@ int libxl__build_post(libxl__gc *gc, uint32_t domid,
 }
 }
 
+skip_fork:
 ents = libxl__calloc(gc, 12 + (info->max_vcpus * 2) + 2, sizeof(char *));
 ents[0] = "memory/static-max";
 ents[1] = GCSPRINTF("%"PRId64, info->max_memkb);
-- 
2.25.1




[PATCH v19 for-4.14 06/13] tools/libxl: adjust domcreate_bootloader_done

2020-06-01 Thread Tamas K Lengyel
Add special handling for when only the device model needs launching for forks.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_create.c   | 9 +
 tools/libxl/libxl_internal.h | 1 +
 2 files changed, 10 insertions(+)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 3f0745acc6..ab3ac096ee 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1376,6 +1376,15 @@ static void domcreate_bootloader_done(libxl__egc *egc,
 return;
 }
 
+if (d_config->dm_restore_file) {
+dcs->srs.dcs = dcs;
+dcs->srs.ao = ao;
+state->forked_vm = true;
+rc = libxl__domain_build(gc, d_config, domid, state);
+domcreate_rebuild_done(egc, dcs, rc);
+return;
+}
+
 /* Prepare environment for domcreate_stream_done */
 dcs->srs.dcs = dcs;
 
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 19b367daca..eaae955658 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -1376,6 +1376,7 @@ typedef struct {
 
 char *saved_state;
 int dm_monitor_fd;
+bool forked_vm;
 
 libxl__file_reference pv_kernel;
 libxl__file_reference pv_ramdisk;
-- 
2.25.1




[PATCH v19 for-4.14 04/13] tools/libxl: populate xenstore entries when launching dm for VM fork

2020-06-01 Thread Tamas K Lengyel
No need to call libxl__domain_make since the domain already exists; we only
need to populate the xenstore entries via libxl__domain_make_xs_entries.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_create.c  | 11 ++-
 tools/libxl/libxl_types.idl |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 09cf99d304..c3614e5a30 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1244,7 +1244,13 @@ static void initiate_domain_create(libxl__egc *egc,
 ret = libxl__domain_config_setdefault(gc,d_config,domid);
 if (ret) goto error_out;
 
-ret = libxl__domain_make(gc, d_config, dbs, &domid, dcs->soft_reset);
+/* If no dm_restore_file is specified we are in the normal path */
+if (!d_config->dm_restore_file)
+ret = libxl__domain_make(gc, d_config, dbs, &domid, dcs->soft_reset);
+else
+ret = libxl__domain_make_xs_entries(gc, d_config, &dcs->build_state,
+domid);
+
 if (ret) {
 LOGD(ERROR, domid, "cannot make domain: %d", ret);
 dcs->guest_domid = domid;
@@ -2052,6 +2058,9 @@ static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
 cdcs->dcs.domid = INVALID_DOMID;
 cdcs->dcs.soft_reset = false;
 
+if (d_config->dm_restore_file)
+cdcs->dcs.domid = *domid;
+
 if (cdcs->dcs.restore_params.checkpointed_stream ==
 LIBXL_CHECKPOINTED_STREAM_COLO) {
 cdcs->dcs.colo_proxy_script =
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 9d3f05f399..b9cc139b0a 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -961,6 +961,7 @@ libxl_domain_config = Struct("domain_config", [
 ("on_watchdog", libxl_action_on_shutdown),
 ("on_crash", libxl_action_on_shutdown),
 ("on_soft_reset", libxl_action_on_shutdown),
+("dm_restore_file", string, {'const': True}),
 ], dir=DIR_IN)
 
 libxl_diskinfo = Struct("diskinfo", [
-- 
2.25.1




[PATCH v19 for-4.14 00/13] VM forking

2020-06-01 Thread Tamas K Lengyel
The following patches are part of the series that implement VM forking for
Intel HVM guests to allow for the fast creation of identical VMs without the
assosciated high startup costs of booting or restoring the VM from a savefile.

JIRA issue: https://xenproject.atlassian.net/browse/XEN-89

The fork operation is implemented as part of the "xl fork-vm" command:
xl fork-vm -C  -Q  

By default a fully functional fork is created. The user is, however, in charge
of creating the appropriate config file for the fork and generating the QEMU
save file before the fork-vm call is made. The config file needs to give the
fork a new name at minimum but other settings may also require changes. Certain
settings in the config file of both the parent and the fork have to be set to
default. Details are documented.

The interface also allows splitting the forking into two steps:
xl fork-vm --launch-dm no \
   -m  \
   -p 
xl fork-vm --launch-dm late \
   -C  \
   -Q  \
   

The split creation model is useful when the VM needs to be created as fast as
possible. The forked VM can be unpaused without the device model being
launched, to be monitored and accessed via VMI. Note however that without its
device model running (depending on what is executing in the VM) it is bound to
misbehave or even crash when it's trying to access devices that would be
emulated by QEMU. We anticipate that for certain use-cases this would be an
acceptable situation, for example when fuzzing is performed on code segments
that don't access such devices.

Launching the device model requires the QEMU Xen savefile to be generated
manually from the parent VM. This can be accomplished simply by connecting to
its QMP socket and issuing the "xen-save-devices-state" command. For example
using the standard tool socat these commands can be used to generate the file:
socat - UNIX-CONNECT:/var/run/xen/qmp-libxl-
{ "execute": "qmp_capabilities" }
{ "execute": "xen-save-devices-state", \
"arguments": { "filename": "/path/to/save/qemu_state", \
"live": false} }

The series has been tested with Windows VMs and functions as expected. Linux
VMs when forked from a running VM will have a frozen VNC screen. Linux VMs at
this time can only be forked with a working device model when the parent VM was
restored from a snapshot using "xl restore -p". This is a known limitation due
to Linux VMs having to be made aware of being saved/migrated.

New in v19:
Including all the patches currently outstanding into the series
Breaking up the libxl/xl patch into many sub-patches to make it easier to review
libxl/xl is now reduced to the bare essentials to launch QEMU for a VM fork

Tamas K Lengyel (13):
  x86/mem_sharing: block interrupt injection for forks
  tools/libxc: xc_memshr_fork with interrupts blocked
  tools/libxl: Split libxl__domain_make
  tools/libxl: populate xenstore entries when launching dm for VM fork
  tools/libxl: Add checks for dm_restore_file
  tools/libxl: adjust domcreate_bootloader_done
  tools/libxl: Adjust libxl__build_pre
  tools/libxl: Adjust libxl__build_post
  tools/libxl: libxl__build_hvm_fork
  tools/libxl: set QEMU saved_state from dm_restore_file
  tools/libxl: Add VM forking public functions
  tools/xl: Add xl fork-vm command
  tools/xl: document fork-vm command

 docs/man/xl.1.pod.in |  39 +
 tools/libxc/include/xenctrl.h|   3 +-
 tools/libxc/xc_memshr.c  |   4 +-
 tools/libxl/libxl.h  |  10 +++
 tools/libxl/libxl_create.c   | 134 +--
 tools/libxl/libxl_dm.c   |   2 +-
 tools/libxl/libxl_dom.c  |  59 +++---
 tools/libxl/libxl_internal.h |   5 +-
 tools/libxl/libxl_types.idl  |   1 +
 tools/xl/Makefile|   2 +-
 tools/xl/xl.h|   4 +
 tools/xl/xl_cmdtable.c   |  13 +++
 tools/xl/xl_forkvm.c | 122 
 tools/xl/xl_vmcontrol.c  |  13 +++
 xen/arch/x86/hvm/vmx/intr.c  |   6 ++
 xen/arch/x86/mm/mem_sharing.c|   6 +-
 xen/include/asm-x86/hvm/domain.h |   2 +-
 xen/include/public/memory.h  |   3 +
 18 files changed, 383 insertions(+), 45 deletions(-)
 create mode 100644 tools/xl/xl_forkvm.c

-- 
2.25.1




[PATCH v19 for-4.14 02/13] tools/libxc: xc_memshr_fork with interrupts blocked

2020-06-01 Thread Tamas K Lengyel
Toolstack side for creating forks with interrupt injection blocked.

Signed-off-by: Tamas K Lengyel 
Reviewed-by: Roger Pau Monné 
Acked-by: Ian Jackson 
---
 tools/libxc/include/xenctrl.h | 3 ++-
 tools/libxc/xc_memshr.c   | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index f9e17ae424..51de46 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2241,7 +2241,8 @@ int xc_memshr_range_share(xc_interface *xch,
 int xc_memshr_fork(xc_interface *xch,
uint32_t source_domain,
uint32_t client_domain,
-   bool allow_with_iommu);
+   bool allow_with_iommu,
+   bool block_interrupts);
 
 /*
  * Note: this function is only intended to be used on short-lived forks that
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 2300cc7075..a6cfd7dccf 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -240,7 +240,7 @@ int xc_memshr_debug_gref(xc_interface *xch,
 }
 
 int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
-   bool allow_with_iommu)
+   bool allow_with_iommu, bool block_interrupts)
 {
 xen_mem_sharing_op_t mso;
 
@@ -251,6 +251,8 @@ int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
 
 if ( allow_with_iommu )
 mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+if ( block_interrupts )
+mso.u.fork.flags |= XENMEM_FORK_BLOCK_INTERRUPTS;
 
return xc_memshr_memop(xch, domid, &mso);
 }
-- 
2.25.1




[PATCH v19 for-4.14 09/13] tools/libxl: libxl__build_hvm_fork

2020-06-01 Thread Tamas K Lengyel
Add libxl__build_hvm_fork function that performs only the steps needed for VM
forks, skipping a large chunk of libxl__build_hvm.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_dom.c | 32 +---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 52d49437cc..28117f0907 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -741,14 +741,15 @@ static int hvm_build_set_params(xc_interface *handle, uint32_t domid,
 libxl_domain_build_info *info,
 int store_evtchn, unsigned long *store_mfn,
 int console_evtchn, unsigned long *console_mfn,
-domid_t store_domid, domid_t console_domid)
+domid_t store_domid, domid_t console_domid,
+bool forked_vm)
 {
 struct hvm_info_table *va_hvm;
 uint8_t *va_map, sum;
 uint64_t str_mfn, cons_mfn;
 int i;
 
-if (info->type == LIBXL_DOMAIN_TYPE_HVM) {
+if (info->type == LIBXL_DOMAIN_TYPE_HVM && !forked_vm) {
 va_map = xc_map_foreign_range(handle, domid,
   XC_PAGE_SIZE, PROT_READ | PROT_WRITE,
   HVM_INFO_PFN);
@@ -1053,6 +1054,28 @@ out:
 return rc;
 }
 
+static int libxl__build_hvm_fork(libxl__gc *gc, uint32_t domid,
+ libxl_domain_config *d_config,
+ libxl__domain_build_state *state)
+{
+libxl_ctx *ctx = libxl__gc_owner(gc);
+libxl_domain_build_info *const info = &d_config->b_info;
+
+int rc = hvm_build_set_params(ctx->xch, domid, info, state->store_port,
+  &state->store_mfn, state->console_port,
+  &state->console_mfn, state->store_domid,
+  state->console_domid, state->forked_vm);
+
+if ( rc )
+return rc;
+
+return xc_dom_gnttab_seed(ctx->xch, domid, true,
+  state->console_mfn,
+  state->store_mfn,
+  state->console_domid,
+  state->store_domid);
+}
+
 int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
   libxl_domain_config *d_config,
   libxl__domain_build_state *state)
@@ -1064,6 +1087,9 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
 struct xc_dom_image *dom = NULL;
 bool device_model = info->type == LIBXL_DOMAIN_TYPE_HVM ? true : false;
 
+if (state->forked_vm)
+return libxl__build_hvm_fork(gc, domid, d_config, state);
+
 xc_dom_loginit(ctx->xch);
 
 /*
@@ -1188,7 +1214,7 @@ int libxl__build_hvm(libxl__gc *gc, uint32_t domid,
 rc = hvm_build_set_params(ctx->xch, domid, info, state->store_port,
&state->store_mfn, state->console_port,
&state->console_mfn, state->store_domid,
-   state->console_domid);
+   state->console_domid, false);
 if (rc != 0) {
 LOG(ERROR, "hvm build set params failed");
 goto out;
-- 
2.25.1




[PATCH v19 for-4.14 01/13] x86/mem_sharing: block interrupt injection for forks

2020-06-01 Thread Tamas K Lengyel
When running VM forks without device models (QEMU), it may
be undesirable for Xen to inject interrupts. When creating such forks from
Windows VMs we have observed the kernel trying to process interrupts
immediately after the fork is executed. However without QEMU running such
interrupt handling may not be possible because it may attempt to interact with
devices that are not emulated by a backend. In the best case scenario such
interrupt handling would only present a detour in the VM forks' execution
flow, but in the worst case, as we have actually observed, it can completely stall it.
By disabling interrupt injection a fuzzer can exercise the target code without
interference. For other use-cases this option probably doesn't make sense,
which is why it is not enabled by default.

Forks & memory sharing are only available on Intel CPUs so this only applies
to vmx. Note that this is part of the experimental VM forking feature that's
completely disabled by default and can only be enabled by using
XEN_CONFIG_EXPERT during compile time.

Signed-off-by: Tamas K Lengyel 
Reviewed-by: Roger Pau Monné 
---
 xen/arch/x86/hvm/vmx/intr.c  | 6 ++
 xen/arch/x86/mm/mem_sharing.c| 6 +-
 xen/include/asm-x86/hvm/domain.h | 2 +-
 xen/include/public/memory.h  | 3 +++
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
index 000e14af49..80bfbb4787 100644
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -256,6 +256,12 @@ void vmx_intr_assist(void)
 if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
 return;
 
+#ifdef CONFIG_MEM_SHARING
+/* Block event injection for VM fork if requested */
+if ( unlikely(v->domain->arch.hvm.mem_sharing.block_interrupts) )
+return;
+#endif
+
 /* Crank the handle on interrupt state. */
 pt_vector = pt_update_irq(v);
 
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 19922ab5d1..c428fd16ce 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -2106,7 +2106,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = -EINVAL;
 if ( mso.u.fork.pad )
 goto out;
-if ( mso.u.fork.flags & ~XENMEM_FORK_WITH_IOMMU_ALLOWED )
+if ( mso.u.fork.flags &
+ ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS) )
 goto out;
 
 rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
@@ -2134,6 +2135,9 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
"lh", XENMEM_sharing_op,
arg);
+else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_BLOCK_INTERRUPTS) )
+d->arch.hvm.mem_sharing.block_interrupts = true;
+
 rcu_unlock_domain(pd);
 break;
 }
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 95fe18cddc..9d247baf4d 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -67,7 +67,7 @@ struct hvm_ioreq_server {
 #ifdef CONFIG_MEM_SHARING
 struct mem_sharing_domain
 {
-bool enabled;
+bool enabled, block_interrupts;
 
 /*
  * When releasing shared gfn's in a preemptible manner, recall where
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index dbd35305df..850bd72c52 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -536,7 +536,10 @@ struct xen_mem_sharing_op {
 } debug;
 struct mem_sharing_op_fork {  /* OP_FORK */
 domid_t parent_domain;/* IN: parent's domain id */
+/* Only makes sense for short-lived forks */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
+/* Only makes sense for short-lived forks */
+#define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
 uint16_t flags;   /* IN: optional settings */
 uint32_t pad; /* Must be set to 0 */
 } fork;
-- 
2.25.1
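
For reference, the wire-level shape of the request: the fork (child) domain
is the memop target and the parent is named in the op payload. A sketch of
roughly what the libxc wrapper assembles internally (xc_memshr_memop() is
libxc-internal and shown here only for illustration):

#include <string.h>
#include <xenctrl.h>

static int fork_memop_sketch(xc_interface *xch, uint32_t pdomid,
                             uint32_t domid)
{
    xen_mem_sharing_op_t mso;

    memset(&mso, 0, sizeof(mso));
    mso.op = XENMEM_sharing_op_fork;
    mso.u.fork.parent_domain = pdomid;
    mso.u.fork.flags = XENMEM_FORK_WITH_IOMMU_ALLOWED |
                       XENMEM_FORK_BLOCK_INTERRUPTS;

    /* Wraps do_memory_op(XENMEM_sharing_op, ...) on the fork's domid. */
    return xc_memshr_memop(xch, domid, &mso);
}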




[PATCH v19 for-4.14 03/13] tools/libxl: Split libxl__domain_make

2020-06-01 Thread Tamas K Lengyel
Make part of libxl__domain_make into a separate function. No functional change.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_create.c   | 62 +++-
 tools/libxl/libxl_internal.h |  4 ++-
 2 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 75862dc6ed..09cf99d304 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -579,15 +579,7 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
uint32_t *domid, bool soft_reset)
 {
 libxl_ctx *ctx = libxl__gc_owner(gc);
-int ret, rc, nb_vm;
-const char *dom_type;
-char *uuid_string;
-char *dom_path, *vm_path, *libxl_path;
-struct xs_permissions roperm[2];
-struct xs_permissions rwperm[1];
-struct xs_permissions noperm[1];
-xs_transaction_t t = 0;
-libxl_vminfo *vm_list;
+int ret, rc;
 
 /* convenience aliases */
libxl_domain_create_info *info = &d_config->c_info;
@@ -595,12 +587,6 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
 assert(soft_reset || *domid == INVALID_DOMID);
 
-uuid_string = libxl__uuid2string(gc, info->uuid);
-if (!uuid_string) {
-rc = ERROR_NOMEM;
-goto out;
-}
-
 if (!soft_reset) {
 struct xen_domctl_createdomain create = {
 .ssidref = info->ssidref,
@@ -731,7 +717,37 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 goto out;
 }
 
-dom_path = libxl__xs_get_dompath(gc, *domid);
+rc = libxl__domain_make_xs_entries(gc, d_config, state, *domid);
+
+ out:
+return rc;
+}
+
+int libxl__domain_make_xs_entries(libxl__gc *gc, libxl_domain_config *d_config,
+  libxl__domain_build_state *state,
+  uint32_t domid)
+{
+libxl_ctx *ctx = libxl__gc_owner(gc);
+int rc, nb_vm;
+const char *dom_type;
+char *uuid_string;
+char *dom_path, *vm_path, *libxl_path;
+struct xs_permissions roperm[2];
+struct xs_permissions rwperm[1];
+struct xs_permissions noperm[1];
+xs_transaction_t t = 0;
+libxl_vminfo *vm_list;
+
+/* convenience aliases */
+libxl_domain_create_info *info = &d_config->c_info;
+
+uuid_string = libxl__uuid2string(gc, info->uuid);
+if (!uuid_string) {
+rc = ERROR_NOMEM;
+goto out;
+}
+
+dom_path = libxl__xs_get_dompath(gc, domid);
 if (!dom_path) {
 rc = ERROR_FAIL;
 goto out;
@@ -739,12 +755,12 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
 vm_path = GCSPRINTF("/vm/%s", uuid_string);
 if (!vm_path) {
-LOGD(ERROR, *domid, "cannot allocate create paths");
+LOGD(ERROR, domid, "cannot allocate create paths");
 rc = ERROR_FAIL;
 goto out;
 }
 
-libxl_path = libxl__xs_libxl_path(gc, *domid);
+libxl_path = libxl__xs_libxl_path(gc, domid);
 if (!libxl_path) {
 rc = ERROR_FAIL;
 goto out;
@@ -755,10 +771,10 @@ int libxl__domain_make(libxl__gc *gc, libxl_domain_config *d_config,
 
 roperm[0].id = 0;
 roperm[0].perms = XS_PERM_NONE;
-roperm[1].id = *domid;
+roperm[1].id = domid;
 roperm[1].perms = XS_PERM_READ;
 
-rwperm[0].id = *domid;
+rwperm[0].id = domid;
 rwperm[0].perms = XS_PERM_NONE;
 
 retry_transaction:
@@ -776,7 +792,7 @@ retry_transaction:
 noperm, ARRAY_SIZE(noperm));
 
xs_write(ctx->xsh, t, GCSPRINTF("%s/vm", dom_path), vm_path, strlen(vm_path));
-rc = libxl__domain_rename(gc, *domid, 0, info->name, t);
+rc = libxl__domain_rename(gc, domid, 0, info->name, t);
 if (rc)
 goto out;
 
@@ -866,7 +882,7 @@ retry_transaction:
 
vm_list = libxl_list_vm(ctx, &nb_vm);
 if (!vm_list) {
-LOGD(ERROR, *domid, "cannot get number of running guests");
+LOGD(ERROR, domid, "cannot get number of running guests");
 rc = ERROR_FAIL;
 goto out;
 }
@@ -890,7 +906,7 @@ retry_transaction:
 t = 0;
 goto retry_transaction;
 }
-LOGED(ERROR, *domid, "domain creation ""xenstore transaction commit failed");
+LOGED(ERROR, domid, "domain creation ""xenstore transaction commit failed");
 rc = ERROR_FAIL;
 goto out;
 }
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index c7ece066c4..19b367daca 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -1983,7 +1983,9 @@ _hidden int libxl__domain_make(libxl__gc *gc,
libxl_domain_config *d_config,
libxl__domain_build_state *state,
uint32_t *domid, bool soft_reset);

[PATCH v19 for-4.14 05/13] tools/libxl: Add checks for dm_restore_file

2020-06-01 Thread Tamas K Lengyel
We can skip a bunch of steps a normal domain creation would entail, similar
to how domain restore & soft_reset skip them.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_create.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index c3614e5a30..3f0745acc6 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1294,7 +1294,7 @@ static void initiate_domain_create(libxl__egc *egc,
 if (ret)
 goto error_out;
 
-if (dbs->restore || dcs->soft_reset) {
+if (dbs->restore || dcs->soft_reset || d_config->dm_restore_file) {
 LOGD(DEBUG, domid, "restoring, not running bootloader");
domcreate_bootloader_done(egc, &dcs->bl, 0);
 } else  {
@@ -1370,7 +1370,7 @@ static void domcreate_bootloader_done(libxl__egc *egc,
 dcs->sdss.dm.callback = domcreate_devmodel_started;
 dcs->sdss.callback = domcreate_devmodel_started;
 
-if (restore_fd < 0 && !dcs->soft_reset) {
+if (restore_fd < 0 && !dcs->soft_reset && !d_config->dm_restore_file) {
 rc = libxl__domain_build(gc, d_config, domid, state);
 domcreate_rebuild_done(egc, dcs, rc);
 return;
-- 
2.25.1




[PATCH v19 for-4.14 07/13] tools/libxl: Adjust libxl__build_pre

2020-06-01 Thread Tamas K Lengyel
Skips parts not relevant for VM forks. No functional change in existing code,
only relocating some bits that don't need to be done at the very end.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_dom.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index dd1aff89a3..1b55097a1a 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -249,9 +249,12 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
libxl_domain_build_info *const info = &d_config->b_info;
 libxl_ctx *ctx = libxl__gc_owner(gc);
 char *xs_domid, *con_domid;
-int rc;
+int rc = 0;
 uint64_t size;
 
+if (state->forked_vm)
+goto skip_fork;
+
 if (xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus) != 0) {
 LOG(ERROR, "Couldn't set max vcpu count");
 return ERROR_FAIL;
@@ -374,6 +377,16 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
 return ERROR_FAIL;
 }
 
+if ( (rc = libxl__arch_domain_create(gc, d_config, domid)) )
+return rc;
+
+/* Construct a CPUID policy, but only for brand new domains.  Domains
+ * being migrated-in/restored have CPUID handled during the
+ * static_data_done() callback. */
+if (!state->restore)
+libxl__cpuid_legacy(ctx, domid, info);
+
+skip_fork:
 xs_domid = xs_read(ctx->xsh, XBT_NULL, "/tool/xenstored/domid", NULL);
 state->store_domid = xs_domid ? atoi(xs_domid) : 0;
 free(xs_domid);
@@ -385,14 +398,6 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
state->store_port = xc_evtchn_alloc_unbound(ctx->xch, domid, state->store_domid);
state->console_port = xc_evtchn_alloc_unbound(ctx->xch, domid, state->console_domid);
 
-rc = libxl__arch_domain_create(gc, d_config, domid);
-
-/* Construct a CPUID policy, but only for brand new domains.  Domains
- * being migrated-in/restored have CPUID handled during the
- * static_data_done() callback. */
-if (!state->restore)
-libxl__cpuid_legacy(ctx, domid, info);
-
 return rc;
 }
 
-- 
2.25.1




[PATCH v19 for-4.14 11/13] tools/libxl: Add VM forking public functions

2020-06-01 Thread Tamas K Lengyel
Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl.h| 10 +
 tools/libxl/libxl_create.c | 44 ++
 2 files changed, 54 insertions(+)

diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 71709dc585..79792d6e29 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
static inline int libxl_qemu_monitor_command_0x041200(libxl_ctx *ctx,
  */
 int libxl_clear_domid_history(libxl_ctx *ctx);
 
+/*
+ * Experimental VM forking functions
+ */
+int libxl_domain_fork_vm(libxl_ctx *ctx, uint32_t pdomid, uint32_t *domid)
+ LIBXL_EXTERNAL_CALLERS_ONLY;
+
+int libxl_domain_fork_launch_dm(libxl_ctx *ctx, libxl_domain_config *d_config,
+uint32_t domid,
+const libxl_asyncprogress_how *aop_console_how)
+LIBXL_EXTERNAL_CALLERS_ONLY;
 #endif /* LIBXL_H */
 
 /*
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 27f790cae1..9190e4e263 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -2339,6 +2339,50 @@ int libxl_domain_soft_reset(libxl_ctx *ctx,
 aop_console_how);
 }
 
+/*
+ * The parent domain is expected to be created with default settings for
+ * - max_evtch_port
+ * - max_grant_frames
+ * - max_maptrack_frames
+ */
+int libxl_domain_fork_vm(libxl_ctx *ctx, uint32_t pdomid, uint32_t *domid)
+{
+int rc;
+xc_dominfo_t info;
+struct xen_domctl_createdomain create = {0};
+
+if ( 1 != xc_domain_getinfo(ctx->xch, pdomid, 1, &info) )
+return ERROR_INVAL;
+
+if ( info.domid != pdomid || !info.hvm || !info.hap )
+return ERROR_INVAL;
+
+create.flags |= XEN_DOMCTL_CDF_hvm;
+create.flags |= XEN_DOMCTL_CDF_hap;
+create.flags |= XEN_DOMCTL_CDF_oos_off;
+create.arch.emulation_flags = info.arch_config.emulation_flags;
+create.ssidref = info.ssidref;
+create.max_vcpus = info.max_vcpu_id + 1;
+create.max_evtchn_port = 1023;
+create.max_grant_frames = LIBXL_MAX_GRANT_FRAMES_DEFAULT;
+create.max_maptrack_frames = LIBXL_MAX_MAPTRACK_FRAMES_DEFAULT;
+
+if ( (rc = xc_domain_create(ctx->xch, domid, &create)) )
+return rc;
+
+if ( (rc = xc_memshr_fork(ctx->xch, pdomid, *domid, false, false)) )
+xc_domain_destroy(ctx->xch, *domid);
+
+return rc;
+}
+
+int libxl_domain_fork_launch_dm(libxl_ctx *ctx, libxl_domain_config *d_config,
+uint32_t domid,
+const libxl_asyncprogress_how *aop_console_how)
+{
+return do_domain_create(ctx, d_config, &domid, -1, -1, 0, 0, aop_console_how);
+}
+
 /*
  * Local variables:
  * mode: C
-- 
2.25.1
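
Putting the two public functions together, a client would drive the split
model roughly as below (sketch only; this is approximately what the xl
fork-vm command in patch 12 does, with error handling elided):

#include <libxl.h>

/* Hypothetical driver: create the fork first, launch the device model
 * later. Assumes ctx is an initialized libxl_ctx, the parent was created
 * by xl with default evtchn/grant/maptrack limits, and d_config was
 * parsed from a config file matching the parent, with
 * d_config->dm_restore_file pointing at a QEMU save file generated via
 * QMP (see patch 13). */
static int fork_and_launch(libxl_ctx *ctx, uint32_t parent_domid,
                           libxl_domain_config *d_config)
{
    uint32_t fork_domid = 0; /* filled in by libxl_domain_fork_vm() */
    int rc;

    rc = libxl_domain_fork_vm(ctx, parent_domid, &fork_domid);
    if (rc)
        return rc;

    return libxl_domain_fork_launch_dm(ctx, d_config, fork_domid,
                                       NULL /* aop_console_how */);
}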




[PATCH v19 for-4.14 10/13] tools/libxl: set QEMU saved_state from dm_restore_file

2020-06-01 Thread Tamas K Lengyel
And make sure we don't remove the file once done.

Signed-off-by: Tamas K Lengyel 
---
 tools/libxl/libxl_create.c | 4 
 tools/libxl/libxl_dm.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index ab3ac096ee..27f790cae1 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1602,6 +1602,7 @@ static void domcreate_rebuild_done(libxl__egc *egc,
 /* convenience aliases */
 const uint32_t domid = dcs->guest_domid;
 libxl_domain_config *const d_config = dcs->guest_config;
+libxl__domain_build_state *const state = &dcs->build_state;
 
 if (ret) {
 LOGD(ERROR, domid, "cannot (re-)build domain: %d", ret);
@@ -1609,6 +1610,9 @@ static void domcreate_rebuild_done(libxl__egc *egc,
 goto error_out;
 }
 
+if (d_config->dm_restore_file)
+state->saved_state = GCSPRINTF("%s", d_config->dm_restore_file);
+
store_libxl_entry(gc, domid, &d_config->b_info);
 
libxl__multidev_begin(ao, &dcs->multidev);
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index f2dc5696b9..9b22836e12 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -3104,7 +3104,7 @@ static void device_model_spawn_outcome(libxl__egc *egc,
 
 libxl__domain_build_state *state = dmss->build_state;
 
-if (state->saved_state) {
+if (state->saved_state && !state->forked_vm) {
 ret2 = unlink(state->saved_state);
 if (ret2) {
LOGED(ERROR, dmss->guest_domid, "%s: failed to remove device-model state %s",
-- 
2.25.1




[PATCH v19 for-4.14 13/13] tools/xl: document fork-vm command

2020-06-01 Thread Tamas K Lengyel
Signed-off-by: Tamas K Lengyel 
---
 docs/man/xl.1.pod.in | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
index 09339282e6..9e87b0314f 100644
--- a/docs/man/xl.1.pod.in
+++ b/docs/man/xl.1.pod.in
@@ -708,6 +708,45 @@ above).
 
 =back
 
+=item B<fork-vm> [I<OPTIONS>] I<domid>
+
+Create a fork of a running VM.  The domain will be paused after the operation
+and remains paused while forks of it exist.  Experimental and x86 only.
+Forks can only be made of domains with HAP enabled and on Intel hardware.  The
+parent domain must be created with the xl toolstack and its configuration must
+not manually define max_grant_frames, max_maptrack_frames or max_event_channels.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-p>
+
+Leave the forked VM paused after creating it.  The parent always remains paused
+while there are forks active from it; this is enforced by the hypervisor.
+
+=item B<--launch-dm>
+
+Specify whether the device model (QEMU) should be launched for the fork.  Late
+launch allows starting the device model for an already running fork previously
+created with "--launch-dm no".
+
+=item B<-C>
+
+The config file to use when launching the device model.  Currently required when
+launching the device model.  Most config settings MUST match the parent domain
+exactly, only change VM name, disk path and network configurations.
+
+=item B<-Q>
+
+The path to the QEMU save file to use when launching the device model.  Currently
+required when launching the device model.  Generate it by connecting to the parent
+domain's QMP socket and issuing:
+ { "execute": "qmp_capabilities" }
+ { "execute": "xen-save-devices-state", "arguments": { "filename": 
"/path/to/qemu.save", "live": false} }
+
+=back
+
=item B<sharing> [I<domain-id>]
 
 Display the number of shared pages for a specified domain. If no domain is
-- 
2.25.1
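
The QMP exchange documented above can be scripted; a rough C sketch of
driving the parent's QMP UNIX socket (hypothetical helper; for a libxl-built
domain the socket typically lives at /var/run/xen/qmp-libxl-<domid>, and a
real tool would parse the replies instead of discarding them):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int save_qemu_state(const char *qmp_path, const char *save_path)
{
    char buf[4096];
    struct sockaddr_un addr = { .sun_family = AF_UNIX };
    int fd = socket(AF_UNIX, SOCK_STREAM, 0);

    if (fd < 0)
        return -1;

    strncpy(addr.sun_path, qmp_path, sizeof(addr.sun_path) - 1);
    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
        read(fd, buf, sizeof(buf)) < 0 ||   /* discard QMP greeting */
        dprintf(fd, "{ \"execute\": \"qmp_capabilities\" }\n") < 0 ||
        read(fd, buf, sizeof(buf)) < 0 ||   /* discard reply */
        dprintf(fd, "{ \"execute\": \"xen-save-devices-state\", "
                    "\"arguments\": { \"filename\": \"%s\", "
                    "\"live\": false } }\n", save_path) < 0 ||
        read(fd, buf, sizeof(buf)) < 0 )    /* discard reply */
    {
        close(fd);
        return -1;
    }

    close(fd);
    return 0;
}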




[PATCH v19 for-4.14 12/13] tools/xl: Add xl fork-vm command

2020-06-01 Thread Tamas K Lengyel
Adding the xl fork-vm command, compiled only on x86. Only the essential bits
are available via this command to create a fork and launch QEMU for it. The
command still allows to perform the task in a split-model, first creating the
fork and launching QEMU only later.

Signed-off-by: Tamas K Lengyel 
---
 tools/xl/Makefile   |   2 +-
 tools/xl/xl.h   |   4 ++
 tools/xl/xl_cmdtable.c  |  13 +
 tools/xl/xl_forkvm.c| 122 
 tools/xl/xl_vmcontrol.c |  13 +
 5 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 tools/xl/xl_forkvm.c

diff --git a/tools/xl/Makefile b/tools/xl/Makefile
index af4912e67a..07333b 100644
--- a/tools/xl/Makefile
+++ b/tools/xl/Makefile
@@ -15,7 +15,7 @@ LDFLAGS += $(PTHREAD_LDFLAGS)
 CFLAGS_XL += $(CFLAGS_libxenlight)
 CFLAGS_XL += -Wshadow
 
-XL_OBJS-$(CONFIG_X86) = xl_psr.o
+XL_OBJS-$(CONFIG_X86) = xl_psr.o xl_forkvm.o
 XL_OBJS = xl.o xl_cmdtable.o xl_sxp.o xl_utils.o $(XL_OBJS-y)
 XL_OBJS += xl_parse.o xl_cpupool.o xl_flask.o
 XL_OBJS += xl_vtpm.o xl_block.o xl_nic.o xl_usb.o
diff --git a/tools/xl/xl.h b/tools/xl/xl.h
index 06569c6c4a..4b4442e875 100644
--- a/tools/xl/xl.h
+++ b/tools/xl/xl.h
@@ -50,6 +50,8 @@ struct domain_create {
 int migrate_fd; /* -1 means none */
 int send_back_fd; /* -1 means none */
 char **migration_domname_r; /* from malloc */
+uint32_t dm_restore_domid; /* restore dm for this domid */
+const char *dm_restore_file; /* path to dm restore file */
 };
 
 int create_domain(struct domain_create *dom_info);
@@ -131,6 +133,8 @@ int main_restore(int argc, char **argv);
 int main_migrate_receive(int argc, char **argv);
 int main_save(int argc, char **argv);
 int main_migrate(int argc, char **argv);
+int main_fork_vm(int argc, char **argv);
+int main_fork_launch_dm(int argc, char **argv);
 #endif
 int main_dump_core(int argc, char **argv);
 int main_pause(int argc, char **argv);
diff --git a/tools/xl/xl_cmdtable.c b/tools/xl/xl_cmdtable.c
index 08335394e5..523d955317 100644
--- a/tools/xl/xl_cmdtable.c
+++ b/tools/xl/xl_cmdtable.c
@@ -187,6 +187,19 @@ struct cmd_spec cmd_table[] = {
   "Restore a domain from a saved state",
   "- for internal use only",
 },
+#if defined(__i386__) || defined(__x86_64__)
+{ "fork-vm",
+  &main_fork_vm, 0, 1,
+  "Fork a domain from the running parent domid. Experimental. Most config 
settings must match parent.",
+  "[options] ",
+  "-h   Print this help.\n"
+  "-C   Use config file for VM fork.\n"
+  "-Q   Use qemu save file for VM fork.\n"
+  "--launch-dm Launch device model (QEMU) for VM fork 
(default yes).\n"
+  "-p   Do not unpause fork VM fork after 
operation.\n"
+  "-d   Enable debug messages.\n"
+},
+#endif
 #endif
 { "dump-core",
&main_dump_core, 0, 1,
diff --git a/tools/xl/xl_forkvm.c b/tools/xl/xl_forkvm.c
new file mode 100644
index 00..5ab57ae41b
--- /dev/null
+++ b/tools/xl/xl_forkvm.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2020 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <libxl.h>
+#include <libxl_utils.h>
+#include <libxlutil.h>
+
+#include "xl.h"
+#include "xl_utils.h"
+#include "xl_parse.h"
+
+int main_fork_vm(int argc, char **argv)
+{
+int rc, debug = 0;
+uint32_t domid_in = INVALID_DOMID, domid_out = INVALID_DOMID;
+int launch_dm = 1;
+bool pause = 0;
+const char *config_file = NULL;
+const char *dm_restore_file = NULL;
+
+int opt;
+static struct option opts[] = {
+{"launch-dm", 1, 0, 'l'},
+COMMON_LONG_OPTS
+};
+
+SWITCH_FOREACH_OPT(opt, "phdC:Q:l:", opts, "fork-vm", 1) {
+case 'd':
+debug = 1;
+break;
+case 'p':
+pause = 1;
+break;
+case 'C':
+config_file = optarg;
+break;
+case 'Q':
+dm_restore_file = optarg;
+break;
+case 'l':
+if ( !strcmp(optarg, "no") )
+launch_dm = 0;
+if ( !strcmp(optarg, "yes") )
+launch_dm = 1;
+if ( !strcmp(optarg, "late") )
+launch_dm = 2;
+b

[PATCH v2 for-4.14 1/2] x86/mem_sharing: block interrupt injection for forks

2020-05-22 Thread Tamas K Lengyel
When running shallow forks without device models it may be undesirable for Xen
to inject interrupts. With Windows forks we have observed the kernel going into
infinite loops when trying to process such interrupts, likely because it 
attempts
to interact with devices that are not responding without QEMU running. By
disabling interrupt injection the fuzzer can exercise the target code without
interference.

Forks & memory sharing are only available on Intel CPUs so this only applies
to vmx.

Signed-off-by: Tamas K Lengyel 
---
v2: prohibit => block
minor style adjustments
---
 xen/arch/x86/hvm/vmx/intr.c  | 6 ++
 xen/arch/x86/mm/mem_sharing.c| 6 +-
 xen/include/asm-x86/hvm/domain.h | 2 ++
 xen/include/public/memory.h  | 1 +
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
index 000e14af49..80bfbb4787 100644
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -256,6 +256,12 @@ void vmx_intr_assist(void)
 if ( unlikely(v->arch.vm_event) && v->arch.vm_event->sync_event )
 return;
 
+#ifdef CONFIG_MEM_SHARING
+/* Block event injection for VM fork if requested */
+if ( unlikely(v->domain->arch.hvm.mem_sharing.block_interrupts) )
+return;
+#endif
+
 /* Crank the handle on interrupt state. */
 pt_vector = pt_update_irq(v);
 
diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 7271e5c90b..0c45a8d67e 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -2106,7 +2106,8 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = -EINVAL;
 if ( mso.u.fork.pad )
 goto out;
-if ( mso.u.fork.flags & ~XENMEM_FORK_WITH_IOMMU_ALLOWED )
+if ( mso.u.fork.flags &
+ ~(XENMEM_FORK_WITH_IOMMU_ALLOWED | XENMEM_FORK_BLOCK_INTERRUPTS) )
 goto out;
 
 rc = rcu_lock_live_remote_domain_by_id(mso.u.fork.parent_domain,
@@ -2134,6 +2135,9 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
"lh", XENMEM_sharing_op,
arg);
+else if ( !rc && (mso.u.fork.flags & XENMEM_FORK_BLOCK_INTERRUPTS) )
+d->arch.hvm.mem_sharing.block_interrupts = true;
+
 rcu_unlock_domain(pd);
 break;
 }
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 95fe18cddc..37e494d234 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -74,6 +74,8 @@ struct mem_sharing_domain
  * to resume the search.
  */
 unsigned long next_shared_gfn_to_relinquish;
+
+bool block_interrupts;
 };
 #endif
 
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index dbd35305df..1e4959638d 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -537,6 +537,7 @@ struct xen_mem_sharing_op {
 struct mem_sharing_op_fork {  /* OP_FORK */
 domid_t parent_domain;/* IN: parent's domain id */
 #define XENMEM_FORK_WITH_IOMMU_ALLOWED (1u << 0)
+#define XENMEM_FORK_BLOCK_INTERRUPTS   (1u << 1)
 uint16_t flags;   /* IN: optional settings */
 uint32_t pad; /* Must be set to 0 */
 } fork;
-- 
2.25.1




[PATCH v2 for-4.14 2/2] tools/libxc: xc_memshr_fork with interrupts blocked

2020-05-22 Thread Tamas K Lengyel
Toolstack side for creating forks with interrupt injection blocked.

Signed-off-by: Tamas K Lengyel 
Reviewed-by: Roger Pau Monné 
Acked-by: Ian Jackson 
---
 tools/libxc/include/xenctrl.h | 3 ++-
 tools/libxc/xc_memshr.c   | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 45ff7db1e8..804ff001d7 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2242,7 +2242,8 @@ int xc_memshr_range_share(xc_interface *xch,
 int xc_memshr_fork(xc_interface *xch,
uint32_t source_domain,
uint32_t client_domain,
-   bool allow_with_iommu);
+   bool allow_with_iommu,
+   bool block_interrupts);
 
 /*
  * Note: this function is only intended to be used on short-lived forks that
diff --git a/tools/libxc/xc_memshr.c b/tools/libxc/xc_memshr.c
index 2300cc7075..a6cfd7dccf 100644
--- a/tools/libxc/xc_memshr.c
+++ b/tools/libxc/xc_memshr.c
@@ -240,7 +240,7 @@ int xc_memshr_debug_gref(xc_interface *xch,
 }
 
 int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
-   bool allow_with_iommu)
+   bool allow_with_iommu, bool block_interrupts)
 {
 xen_mem_sharing_op_t mso;
 
@@ -251,6 +251,8 @@ int xc_memshr_fork(xc_interface *xch, uint32_t pdomid, uint32_t domid,
 
 if ( allow_with_iommu )
 mso.u.fork.flags |= XENMEM_FORK_WITH_IOMMU_ALLOWED;
+if ( block_interrupts )
+mso.u.fork.flags |= XENMEM_FORK_BLOCK_INTERRUPTS;
 
return xc_memshr_memop(xch, domid, &mso);
 }
-- 
2.25.1




[PATCH v3 for-4.14] x86/monitor: revert default behavior when monitoring register write events

2020-06-02 Thread Tamas K Lengyel
For the last couple of years we have received numerous reports from users of
monitor vm_events of spurious guest crashes when using events. In particular,
it has been observed that the problem occurs when vm_events are being disabled.
The nature of the guest crash varied widely and has only occurred occasionally.
This made debugging the issue particularly hard. We had discussions about this
issue even here on the xen-devel mailing list with no luck figuring it out.

The bug has now been identified as a race-condition between register event
handling and disabling the monitor vm_event interface.

Patch 96760e2fba100d694300a81baddb5740e0f8c0ee, "vm_event: deny register writes
if refused by  vm_event reply" is the patch that introduced the error. In this
patch the default behavior regarding emulation of register write events is
changed so that they get postponed until the corresponding vm_event handler
decides whether to allow such write to take place. Unfortunately this can only
be implemented by performing the deny/allow step when the vCPU gets scheduled.
Due to that postponed emulation of the event if the user decides to pause the
VM in the vm_event handler and then disable events, the entire emulation step
is skipped the next time the vCPU is resumed. Even if the user doesn't pause
during the vm_event handling but exits immediately and disables vm_event, the
situation becomes racey as disabling vm_event may succeed before the guest's
vCPUs get scheduled with the pending emulation task. This has been particularly
the case with VMS that have several vCPUs as after the VM is unpaused it may
actually take a long time before all vCPUs get scheduled.

In this patch we are reverting the default behavior to always perform emulation
of register write events when the event occurs. Postponing them can be turned
on as an option. In that case the user of the interface still has to take care
of only disabling the interface when it's safe, as it remains buggy.

Signed-off-by: Tamas K Lengyel 
---
 xen/arch/x86/hvm/hvm.c| 14 --
 xen/arch/x86/hvm/monitor.c| 13 -
 xen/arch/x86/monitor.c| 10 +-
 xen/include/asm-x86/domain.h  |  1 +
 xen/include/asm-x86/hvm/monitor.h |  7 +++
 xen/include/public/domctl.h   |  1 +
 6 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 74c9f84462..5bb47583b3 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3601,13 +3601,15 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
 
 ASSERT(v->arch.vm_event);
 
-/* The actual write will occur in hvm_do_resume() (if permitted). */
-v->arch.vm_event->write_data.do_write.msr = 1;
-v->arch.vm_event->write_data.msr = msr;
-v->arch.vm_event->write_data.value = msr_content;
+if ( hvm_monitor_msr(msr, msr_content, msr_old_content) )
+{
+/* The actual write will occur in hvm_do_resume(), if permitted. */
+v->arch.vm_event->write_data.do_write.msr = 1;
+v->arch.vm_event->write_data.msr = msr;
+v->arch.vm_event->write_data.value = msr_content;
 
-hvm_monitor_msr(msr, msr_content, msr_old_content);
-return X86EMUL_OKAY;
+return X86EMUL_OKAY;
+}
 }
 
 if ( (ret = guest_wrmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE )
diff --git a/xen/arch/x86/hvm/monitor.c b/xen/arch/x86/hvm/monitor.c
index 8aa14137e2..36894b33a4 100644
--- a/xen/arch/x86/hvm/monitor.c
+++ b/xen/arch/x86/hvm/monitor.c
@@ -53,11 +53,11 @@ bool hvm_monitor_cr(unsigned int index, unsigned long value, unsigned long old)
 .u.write_ctrlreg.old_value = old
 };
 
-if ( monitor_traps(curr, sync, &req) >= 0 )
-return 1;
+return monitor_traps(curr, sync, &req) >= 0 &&
+curr->domain->arch.monitor.control_register_values;
 }
 
-return 0;
+return false;
 }
 
 bool hvm_monitor_emul_unimplemented(void)
@@ -77,7 +77,7 @@ bool hvm_monitor_emul_unimplemented(void)
monitor_traps(curr, true, &req) == 1;
 }
 
-void hvm_monitor_msr(unsigned int msr, uint64_t new_value, uint64_t old_value)
+bool hvm_monitor_msr(unsigned int msr, uint64_t new_value, uint64_t old_value)
 {
 struct vcpu *curr = current;
 
@@ -92,8 +92,11 @@ void hvm_monitor_msr(unsigned int msr, uint64_t new_value, uint64_t old_value)
 .u.mov_to_msr.old_value = old_value
 };
 
-monitor_traps(curr, 1, &req);
+return monitor_traps(curr, 1, &req) >= 0 &&
+curr->domain->arch.monitor.control_register_values;
 }
+
+return false;
 }
 
 void hvm_monitor_descriptor_access(uint64_t exit_info,
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index bbcb7536c7..1517a97f50 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
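
A consumer that does turn the postponed behavior back on still has to order
its teardown carefully, as the commit message notes. A rough sketch of the
safe pattern (hypothetical helper; the sleep is a crude stand-in for "all
vCPUs have been scheduled"):

#include <unistd.h>
#include <xenctrl.h>

static void monitor_teardown_sketch(xc_interface *xch, uint32_t domid)
{
    /* 1. Stop requesting new events and reply to everything outstanding. */

    /* 2. Unpause the domain so each vCPU commits its pending register
     *    write in hvm_do_resume() the next time it is scheduled. */
    xc_domain_unpause(xch, domid);

    /* 3. Only then disable the interface; a hypervisor-side -EAGAIN check
     *    was proposed separately on this list (vm_event_check_pending_op). */
    sleep(1);
    xc_monitor_disable(xch, domid);
}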

Re: [PATCH v2 for-4.14 2/3] xen/vm_event: add vm_event_check_pending_op

2020-06-02 Thread Tamas K Lengyel
On Tue, Jun 2, 2020 at 5:47 AM Roger Pau Monné  wrote:
>
> On Wed, May 20, 2020 at 08:31:53PM -0600, Tamas K Lengyel wrote:
> > Perform sanity checking when shutting vm_event down to determine whether
> > it is safe to do so. Error out with -EAGAIN in case pending operations
> > have been found for the domain.
> >
> > Signed-off-by: Tamas K Lengyel 
> > ---
> >  xen/arch/x86/vm_event.c| 23 +++
> >  xen/common/vm_event.c  | 17 ++---
> >  xen/include/asm-arm/vm_event.h |  7 +++
> >  xen/include/asm-x86/vm_event.h |  2 ++
> >  4 files changed, 46 insertions(+), 3 deletions(-)
> >
> > diff --git a/xen/arch/x86/vm_event.c b/xen/arch/x86/vm_event.c
> > index 848d69c1b0..a23aadc112 100644
> > --- a/xen/arch/x86/vm_event.c
> > +++ b/xen/arch/x86/vm_event.c
> > @@ -297,6 +297,29 @@ void vm_event_emulate_check(struct vcpu *v, vm_event_response_t *rsp)
> >  };
> >  }
> >
> > +bool vm_event_check_pending_op(const struct vcpu *v)
> > +{
> > +struct monitor_write_data *w = &v->arch.vm_event->write_data;
>
> const
>
> > +
> > +if ( !v->arch.vm_event->sync_event )
> > +return false;
> > +
> > +if ( w->do_write.cr0 )
> > +return true;
> > +if ( w->do_write.cr3 )
> > +return true;
> > +if ( w->do_write.cr4 )
> > +return true;
> > +if ( w->do_write.msr )
> > +return true;
> > +if ( v->arch.vm_event->set_gprs )
> > +return true;
> > +if ( v->arch.vm_event->emulate_flags )
> > +return true;
>
> Can you please group this into a single if, ie:
>
> if ( w->do_write.cr0 || w->do_write.cr3 || ... )
> return true;
>
> > +
> > +return false;
> > +}
> > +
> >  /*
> >   * Local variables:
> >   * mode: C
> > diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
> > index 127f2d58f1..2df327a42c 100644
> > --- a/xen/common/vm_event.c
> > +++ b/xen/common/vm_event.c
> > @@ -183,6 +183,7 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
> >  if ( vm_event_check_ring(ved) )
> >  {
> >  struct vcpu *v;
> > +bool pending_op = false;
> >
> >  spin_lock(&ved->lock);
> >
> > @@ -192,9 +193,6 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
> >  return -EBUSY;
> >  }
> >
> > -/* Free domU's event channel and leave the other one unbound */
> > -free_xen_event_channel(d, ved->xen_port);
> > -
> >  /* Unblock all vCPUs */
> >  for_each_vcpu ( d, v )
> >  {
> > @@ -203,8 +201,21 @@ static int vm_event_disable(struct domain *d, struct vm_event_domain **p_ved)
> >  vcpu_unpause(v);
> >  ved->blocked--;
> >  }
> > +
> > +if ( vm_event_check_pending_op(v) )
> > +pending_op = true;
>
> You could just do:
>
> pending_op |= vm_event_check_pending_op(v);
>
> and avoid the initialization of pending_op above. Or alternatively:
>
> if ( !pending_op && vm_event_check_pending_op(v) )
> pending_op = true;
>
> Which avoid repeated calls to vm_event_check_pending_op when at least
> one vCPU is known to be busy.
>
> >  }
> >
> > +/* vm_event ops are still pending until vCPUs get scheduled */
> > +if ( pending_op )
> > +{
> > +spin_unlock(&ved->lock);
> > +return -EAGAIN;
>
> What happens when this gets called from vm_event_cleanup?
>
> AFAICT the result there is ignored, and could leak the vm_event
> allocated memory?

Thanks for the feedback. I'm going to drop this patch at let
Bitdefender pick it up if they feel like fixing their buggy feature.
As things stand for my use-case I only need patch 1 from this series.

Tamas
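
For reference, the helper with the review feedback folded in would have
looked something like this (hypervisor-side sketch only, since the patch was
dropped):

/* vm_event_check_pending_op() with the const qualifier added and the
 * individual checks grouped into one return, per the review above. */
bool vm_event_check_pending_op(const struct vcpu *v)
{
    const struct monitor_write_data *w = &v->arch.vm_event->write_data;

    if ( !v->arch.vm_event->sync_event )
        return false;

    return w->do_write.cr0 || w->do_write.cr3 || w->do_write.cr4 ||
           w->do_write.msr || v->arch.vm_event->set_gprs ||
           v->arch.vm_event->emulate_flags;
}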



Re: [PATCH v2 for-4.14 1/3] xen/monitor: Control register values

2020-06-02 Thread Tamas K Lengyel
On Tue, Jun 2, 2020 at 7:00 AM Jan Beulich  wrote:
>
> On 02.06.2020 14:51, Tamas K Lengyel wrote:
> > On Tue, Jun 2, 2020 at 6:47 AM Jan Beulich  wrote:
> >>
> >> On 02.06.2020 14:40, Tamas K Lengyel wrote:
> >>> On Tue, Jun 2, 2020 at 5:08 AM Roger Pau Monné wrote:
> >>>>
> >>>> On Wed, May 20, 2020 at 08:31:52PM -0600, Tamas K Lengyel wrote:
> >>>>> Extend the monitor_op domctl to include option that enables
> >>>>> controlling what values certain registers are permitted to hold
> >>>>> by a monitor subscriber.
> >>>>
> >>>> I think the change could benefit for some more detail commit message
> >>>> here. Why is this useful?
> >>>
> >>> You would have to ask the Bitdefender folks who made the feature. I
> >>> don't use it. Here we are just making it optional as it is buggy so it
> >>> is disabled by default.
> >>
> >> Now that's exactly the opposite of what I had derived from the
> >> description here so far. Perhaps an at least weak indication
> >> that you want to reword this. For example, from your reply to
> >> Roger I understand it's rather that the new flag allows to
> >> "suppress" the controlling (since presumably you don't change
> >> default behavior), rather then "enabling" it.
> >
> > What we are adding is a domctl you need to call that enables this
> > feature. It's not an option to suppress it. It shouldn't have been
> > enabled by default to begin with. That was a mistake when the feature
> > was contributed and it is buggy.
>
> Okay, in this case it's important to point out that you alter
> default behavior. The BitDefender folks may not like this, yet
> they've been surprisingly silent so far.

Well, it was Bitdefender who altered the default behavior. We are
reverting their mistake and making it optional. But I can certainly
make that more clear.

Tamas



Re: [PATCH v2 for-4.14 1/3] xen/monitor: Control register values

2020-06-02 Thread Tamas K Lengyel
On Tue, Jun 2, 2020 at 5:08 AM Roger Pau Monné  wrote:
>
> On Wed, May 20, 2020 at 08:31:52PM -0600, Tamas K Lengyel wrote:
> > Extend the monitor_op domctl to include option that enables
> > controlling what values certain registers are permitted to hold
> > by a monitor subscriber.
>
> I think the change could benefit for some more detail commit message
> here. Why is this useful?

You would have to ask the Bitdefender folks who made the feature. I
don't use it. Here we are just making it optional as it is buggy so it
is disabled by default.

>
> There already seems to be some support for gating MSR writes, which
> seems to be expanded by this commit?

We don't expand on any existing features, we make an existing feature optional.

>
> Is it solving some kind of bug reported?

It does, please take a look at the cover letter.

>
> > Signed-off-by: Tamas K Lengyel 
> > ---
> >  xen/arch/x86/hvm/hvm.c   | 25 -
> >  xen/arch/x86/monitor.c   | 10 +-
> >  xen/include/asm-x86/domain.h |  1 +
> >  xen/include/public/domctl.h  |  1 +
> >  4 files changed, 27 insertions(+), 10 deletions(-)
> >
> > diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> > index 09ee299bc7..e6780c685b 100644
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -2263,7 +2263,8 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
> >  {
> >  ASSERT(v->arch.vm_event);
> >
> > -if ( hvm_monitor_crX(CR0, value, old_value) )
> > +if ( hvm_monitor_crX(CR0, value, old_value) &&
> > + v->domain->arch.monitor.control_register_values )
> >  {
> >  /* The actual write will occur in hvm_do_resume(), if 
> > permitted. */
> >  v->arch.vm_event->write_data.do_write.cr0 = 1;
> > @@ -2362,7 +2363,8 @@ int hvm_set_cr3(unsigned long value, bool may_defer)
> >  {
> >  ASSERT(v->arch.vm_event);
> >
> > -if ( hvm_monitor_crX(CR3, value, old) )
> > +if ( hvm_monitor_crX(CR3, value, old) &&
> > + v->domain->arch.monitor.control_register_values )
> >  {
> >  /* The actual write will occur in hvm_do_resume(), if 
> > permitted. */
> >  v->arch.vm_event->write_data.do_write.cr3 = 1;
> > @@ -2443,7 +2445,8 @@ int hvm_set_cr4(unsigned long value, bool may_defer)
> >  {
> >  ASSERT(v->arch.vm_event);
> >
> > -if ( hvm_monitor_crX(CR4, value, old_cr) )
> > +if ( hvm_monitor_crX(CR4, value, old_cr) &&
> > + v->domain->arch.monitor.control_register_values )
>
> I think you could return control_register_values in hvm_monitor_crX
> instead of having to add the check to each caller?

We could, but this way the code is more consistent.

>
> >  {
> >  /* The actual write will occur in hvm_do_resume(), if 
> > permitted. */
> >  v->arch.vm_event->write_data.do_write.cr4 = 1;
> > @@ -3587,13 +3590,17 @@ int hvm_msr_write_intercept(unsigned int msr, 
> > uint64_t msr_content,
> >
> >  ASSERT(v->arch.vm_event);
> >
> > -/* The actual write will occur in hvm_do_resume() (if permitted). 
> > */
> > -v->arch.vm_event->write_data.do_write.msr = 1;
> > -v->arch.vm_event->write_data.msr = msr;
> > -v->arch.vm_event->write_data.value = msr_content;
> > -
> >  hvm_monitor_msr(msr, msr_content, msr_old_content);
> > -return X86EMUL_OKAY;
> > +
> > +if ( v->domain->arch.monitor.control_register_values )
>
> Is there any value in limiting control_register_values to MSR that
> represent control registers, like EFER and XSS?

I don't know, you would have to ask Bitdefender about it who made this feature.

>
> > +{
> > +/* The actual write will occur in hvm_do_resume(), if 
> > permitted. */
> > +v->arch.vm_event->write_data.do_write.msr = 1;
> > +v->arch.vm_event->write_data.msr = msr;
> > +v->arch.vm_event->write_data.value = msr_content;
> > +
> > +return X86EMUL_OKAY;
> > +}
>
> You seem to change the previous flow of the function here, that would
> just call hvm_monitor_msr and return previously.
>
> Don't you need to move the return from outside the added if condition
> in order to keep previous behavior? Or else the write is committed
> straight a

Re: [PATCH v2 for-4.14 1/3] xen/monitor: Control register values

2020-06-02 Thread Tamas K Lengyel
On Tue, Jun 2, 2020 at 7:04 AM Jan Beulich  wrote:
>
> On 02.06.2020 15:01, Roger Pau Monné wrote:
> > On Tue, Jun 02, 2020 at 06:40:12AM -0600, Tamas K Lengyel wrote:
> >> On Tue, Jun 2, 2020 at 5:08 AM Roger Pau Monné wrote:
> >>> On Wed, May 20, 2020 at 08:31:52PM -0600, Tamas K Lengyel wrote:
> >>>> --- a/xen/arch/x86/hvm/hvm.c
> >>>> +++ b/xen/arch/x86/hvm/hvm.c
> >>>> @@ -2263,7 +2263,8 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
> >>>>  {
> >>>>  ASSERT(v->arch.vm_event);
> >>>>
> >>>> -if ( hvm_monitor_crX(CR0, value, old_value) )
> >>>> +if ( hvm_monitor_crX(CR0, value, old_value) &&
> >>>> + v->domain->arch.monitor.control_register_values )
> >>>>  {
> >>>>  /* The actual write will occur in hvm_do_resume(), if 
> >>>> permitted. */
> >>>>  v->arch.vm_event->write_data.do_write.cr0 = 1;
> >>>> @@ -2362,7 +2363,8 @@ int hvm_set_cr3(unsigned long value, bool may_defer)
> >>>>  {
> >>>>  ASSERT(v->arch.vm_event);
> >>>>
> >>>> -if ( hvm_monitor_crX(CR3, value, old) )
> >>>> +if ( hvm_monitor_crX(CR3, value, old) &&
> >>>> + v->domain->arch.monitor.control_register_values )
> >>>>  {
> >>>>  /* The actual write will occur in hvm_do_resume(), if 
> >>>> permitted. */
> >>>>  v->arch.vm_event->write_data.do_write.cr3 = 1;
> >>>> @@ -2443,7 +2445,8 @@ int hvm_set_cr4(unsigned long value, bool may_defer)
> >>>>  {
> >>>>  ASSERT(v->arch.vm_event);
> >>>>
> >>>> -if ( hvm_monitor_crX(CR4, value, old_cr) )
> >>>> +if ( hvm_monitor_crX(CR4, value, old_cr) &&
> >>>> + v->domain->arch.monitor.control_register_values )
> >>>
> >>> I think you could return control_register_values in hvm_monitor_crX
> >>> instead of having to add the check to each caller?
> >>
> >> We could, but this way the code is more consistent.
> >
> > OK, I guess it's a matter of taste. I would rather prefer those checks
> > to be confined to hvm_monitor_crX because then the generic code is not
> > polluted with monitor checks, but that's likely just my taste.
>
> +1


OK.
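
To see the shape this subthread converges on, here is hvm_monitor_msr() after
the v3 patch posted earlier in the archive, reconstructed from its hunks
(consolidated sketch; the MSR filtering condition is elided with a placeholder
because it sits outside the quoted hunks and is unchanged by the patch):

bool hvm_monitor_msr(unsigned int msr, uint64_t new_value, uint64_t old_value)
{
    struct vcpu *curr = current;
    bool monitored = true; /* stands in for the elided filtering checks */

    if ( monitored )
    {
        vm_event_request_t req = {
            .reason = VM_EVENT_REASON_MOV_TO_MSR,
            .u.mov_to_msr.msr = msr,
            .u.mov_to_msr.new_value = new_value,
            .u.mov_to_msr.old_value = old_value
        };

        /* The consumer only gets to defer/veto the write when the new
         * control_register_values option is enabled for the domain. */
        return monitor_traps(curr, 1, &req) >= 0 &&
               curr->domain->arch.monitor.control_register_values;
    }

    return false;
}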


