Re: [Xen-devel] [PATCH v3] x86/mm: Suppresses vm_events caused by page-walks

2018-03-01 Thread Razvan Cojocaru


On 03/01/2018 12:09 PM, Jan Beulich wrote:
 On 01.03.18 at 11:00,  wrote:
>> On 02/28/2018 03:56 PM, Jan Beulich wrote:
>> On 28.02.18 at 14:41,  wrote:
>>> On 28.02.18 at 14:25,  wrote:
> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>  return violation;
>  }
>  
> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
> +{
> +struct hvm_hw_cpu ctxt;
> +uint32_t pfec = 0;
> +
> +hvm_funcs.save_cpu_ctxt(v, &ctxt);
> +
> +if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
> + && ga == v->arch.pg_dirty.gla )
> +pfec = PFEC_write_access;
> +
> +paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
> +
> +v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
> +v->arch.pg_dirty.gla = ga;
> +}

 This being the only user of v->arch.pg_dirty, why is the new
 sub-structure made part of struct arch_vcpu instead of
 struct arch_vm_event (or some other sub-structure referenced
 by pointer, such that struct arch_vcpu wouldn't grow in size)?
 And even without that, this is HVM-specific, so would at best
 belong into the HVM sub-structure.

 The PFEC_write_access logic is completely unclear to me, despite
 the attempt to describe this in the description. I'm pretty sure you
 want a code comment here.
>>
>> The thinking here is this: if we've ended up in p2m_mem_access_check()
>> with npfec.kind != npfec_kind_with_gla, then that's an EPT fault caused
>> by a page walk, that can be performed "manually" once Xen tries to set
>> both the A and the D bits.
>>
>> So when it tries to set the A bit, we mark the attempt by storing the
>> RIP/GLA pair, so that when the function is called again for the same
>> values we know that that's an attempt to set the D bit, and we act on
>> that (so that we don't have to lift the page restrictions on the fault
>> page).
>>
>> If there's a cleaner way to achieve this it would be great.
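
For what it's worth, the comment Jan asks for could read roughly as
follows (illustrative wording only, not text from the posted patch,
using the v3 field names pg_dirty.eip/gla):

/*
 * Only reached for EPT faults with npfec.kind != npfec_kind_with_gla,
 * i.e. faults caused by the hardware page walker trying to set the
 * A/D bits of the guest page tables while the page is restricted.
 * The walk is redone "manually" via paging_ga_to_gfn_cr3(), which sets
 * those bits without lifting the restriction.
 *
 * Such a fault occurs twice for a dirtying access: first for the
 * Accessed bit, then for the Dirty bit.  The (eip, gla) pair of the
 * first fault is remembered; if the same pair shows up again, the
 * second fault is assumed to be the Dirty-bit update, so the walk is
 * redone with PFEC_write_access to have the D bit set as well.
 */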
> 
> "Cleaner" is a secondary goal. A correct way would be desirable
> for starters. I don't see what would prevent coming back here a
> second time because something somewhere returned
> X86EMUL_RETRY, causing the insn to simply be restarted. This
> _may_ be possible to address by suitably flushing the two values
> under certain conditions.

Clearly correctness should be the goal; however, what I had in mind was
that if there were some way we could know that the D bit is being set
without trying again, then this would be both cleaner and obviously correct.

As for X86EMUL_RETRY, one of the points of the patch is that on this
path no vm_event is being sent out at all, so no emulation attempt is
taking place (at least not on the vm_event processing path):

 bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
                           struct npfec npfec,
                           vm_event_request_t **req_ptr)
@@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
         }
     }
 
+    if ( vm_event_check_ring(d->vm_event_monitor) &&
+         d->arch.monitor.inguest_pagefault_disabled &&
+         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
+    {
+        v->arch.vm_event->emulate_flags = 0;
+        p2m_set_ad_bits(v, gla);
+
+        return true;
+    }
+
     *req_ptr = NULL; /* Vm_event path starts here. */
     req = xzalloc(vm_event_request_t);

Of course, that doesn't mean that there's no other possibility of
entering p2m_set_ad_bits() twice in a way I'm not seeing now.
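
As a purely illustrative sketch of the "flushing" idea (not something
the v3 patch does), the stored marker could be consumed on a match, so
that a restarted instruction cannot match it a second time:

    /* Hypothetical variant of p2m_set_ad_bits(), for illustration only. */
    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
         && ga == v->arch.pg_dirty.gla )
    {
        pfec = PFEC_write_access;
        /* Consume the marker so a replayed fault starts from scratch. */
        v->arch.pg_dirty.eip = 0;
        v->arch.pg_dirty.gla = 0; /* or a non-canonical sentinel, see below */
    }
    else
    {
        v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
        v->arch.pg_dirty.gla = ga;
    }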


Thanks,
Razvan


Re: [Xen-devel] [PATCH v3] x86/mm: Suppresses vm_events caused by page-walks

2018-03-01 Thread Jan Beulich
>>> On 01.03.18 at 11:00,  wrote:
> On 02/28/2018 03:56 PM, Jan Beulich wrote:
> On 28.02.18 at 14:41,  wrote:
>> On 28.02.18 at 14:25,  wrote:
 --- a/xen/arch/x86/mm/mem_access.c
 +++ b/xen/arch/x86/mm/mem_access.c
 @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
  return violation;
  }
  
 +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
 +{
 +struct hvm_hw_cpu ctxt;
 +uint32_t pfec = 0;
 +
 +hvm_funcs.save_cpu_ctxt(v, &ctxt);
 +
 +if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
 + && ga == v->arch.pg_dirty.gla )
 +pfec = PFEC_write_access;
 +
 +paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
 +
 +v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
 +v->arch.pg_dirty.gla = ga;
 +}
>>>
>>> This being the only user of v->arch.pg_dirty, why is the new
>>> sub-structure made part of struct arch_vcpu instead of
>>> struct arch_vm_event (or some other sub-structure referenced
>>> by pointer, such that struct arch_vcpu wouldn't grow in size)?
>>> And even without that, this is HVM-specific, so would at best
>>> belong into the HVM sub-structure.
>>>
>>> The PFEC_write_access logic is completely unclear to me, despite
>>> the attempt to describe this in the description. I'm pretty sure you
>>> want a code comment here.
> 
> The thinking here is this: if we've ended up in p2m_mem_access_check()
> with npfec.kind != npfec_kind_with_gla, then that's an EPT fault caused
> by a page walk, that can be performed "manually" once Xen tries to set
> both the A and the D bits.
> 
> So when it tries to set the A bit, we mark the attempt by storing the
> RIP/GLA pair, so that when the function is called again for the same
> values we know that that's an attempt to set the D bit, and we act on
> that (so that we don't have to lift the page restrictions on the fault
> page).
> 
> If there's a cleaner way to achieve this it would be great.

"Cleaner" is a secondary goal. A correct way would be desirable
for starters. I don't see what would prevent coming back here a
second time because something somewhere returned
X86EMUL_RETRY, causing the insn to simply be restarted. This
_may_ be possible to address by suitably flushing the two values
under certain conditions.

>>> What if the first pass through this function is with guest EIP zero
>>> and gla also zero?
> 
> Is that possible? If it really is, we could use another default value,
> for example ~0UL (INVALID_GFN?).

Except that we're talking about a GLA here, and ~0UL is a
valid linear address. Some non-canonical address _may_ be
suitable, but I'm not sure.
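
For illustration, one such non-canonical sentinel could look like this
(hypothetical name and placement; on x86-64 an address whose bits 63:48
do not sign-extend bit 47 can never be a valid linear address):

/* Hypothetical sentinel - non-canonical, so it cannot collide with a real GLA. */
#define PG_DIRTY_GLA_INVALID (1UL << 63)

/* At vCPU initialisation, and whenever the stored pair is flushed: */
v->arch.pg_dirty.gla = PG_DIRTY_GLA_INVALID;
v->arch.pg_dirty.eip = 0;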

Jan



Re: [Xen-devel] [PATCH v3] x86/mm: Suppresses vm_events caused by page-walks

2018-03-01 Thread Razvan Cojocaru
On 02/28/2018 03:56 PM, Jan Beulich wrote:
 On 28.02.18 at 14:41,  wrote:
> On 28.02.18 at 14:25,  wrote:
>>> --- a/xen/arch/x86/mm/mem_access.c
>>> +++ b/xen/arch/x86/mm/mem_access.c
>>> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>>>  return violation;
>>>  }
>>>  
>>> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
>>> +{
>>> +struct hvm_hw_cpu ctxt;
>>> +uint32_t pfec = 0;
>>> +
>>> +hvm_funcs.save_cpu_ctxt(v, &ctxt);
>>> +
>>> +if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
>>> + && ga == v->arch.pg_dirty.gla )
>>> +pfec = PFEC_write_access;
>>> +
>>> +paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
>>> +
>>> +v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
>>> +v->arch.pg_dirty.gla = ga;
>>> +}
>>
>> This being the only user of v->arch.pg_dirty, why is the new
>> sub-structure made part of struct arch_vcpu instead of
>> struct arch_vm_event (or some other sub-structure referenced
>> by pointer, such that struct arch_vcpu wouldn't grow in size)?
>> And even without that, this is HVM-specific, so would at best
>> belong into the HVM sub-structure.
>>
>> The PFEC_write_access logic is completely unclear to me, despite
>> the attempt to describe this in the description. I'm pretty sure you
>> want a code comment here.

The thinking here is this: if we've ended up in p2m_mem_access_check()
with npfec.kind != npfec_kind_with_gla, then that's an EPT fault caused
by a page walk, that can be performed "manually" once Xen tries to set
both the A and the D bits.

So when it tries to set the A bit, we mark the attempt by storing the
RIP/GLA pair, so that when the function is called again for the same
values we know that that's an attempt to set the D bit, and we act on
that (so that we don't have to lift the page restrictions on the fault
page).

If there's a cleaner way to achieve this it would be great.

>> What if the first pass through this function is with guest EIP zero
>> and gla also zero?

Is that possible? If it really is, we could use another default value,
for example ~0UL (INVALID_GFN?).

> I should have added here: If the vCPU is paused, there surely
> should be a cheaper way. If the vCPU is not paused, the value
> read is stale by the time you use it.

The VCPU is paused. We'll look at a lighter way to extract CR3.
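
One lighter-weight option (an assumption on my part, relying on the
cached guest CR3 being current for a paused vCPU) might be to read it
directly instead of saving the whole context:

    /* Hypothetical sketch - avoids hvm_funcs.save_cpu_ctxt() just for CR3. */
    unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];

    paging_ga_to_gfn_cr3(v, cr3, ga, &pfec, NULL);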


Thanks,
Razvan


Re: [Xen-devel] [PATCH v3] x86/mm: Suppresses vm_events caused by page-walks

2018-02-28 Thread Jan Beulich
>>> On 28.02.18 at 14:41,  wrote:
 On 28.02.18 at 14:25,  wrote:
>> --- a/xen/arch/x86/mm/mem_access.c
>> +++ b/xen/arch/x86/mm/mem_access.c
>> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>>  return violation;
>>  }
>>  
>> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
>> +{
>> +struct hvm_hw_cpu ctxt;
>> +uint32_t pfec = 0;
>> +
>> +hvm_funcs.save_cpu_ctxt(v, &ctxt);
>> +
>> +if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
>> + && ga == v->arch.pg_dirty.gla )
>> +pfec = PFEC_write_access;
>> +
>> +paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
>> +
>> +v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
>> +v->arch.pg_dirty.gla = ga;
>> +}
> 
> This being the only user of v->arch.pg_dirty, why is the new
> sub-structure made part of struct arch_vcpu instead of
> struct arch_vm_event (or some other sub-structure referenced
> by pointer, such that struct arch_vcpu wouldn't grow in size)?
> And even without that, this is HVM-specific, so would at best
> belong into the HVM sub-structure.
> 
> The PFEC_write_access logic is completely unclear to me, despite
> the attempt to describe this in the description. I'm pretty sure you
> want a code comment here.
> 
> Why is the function argument named ga instead of gla (thus
> matching the structure field)?
> 
> As to using eip (and naming the new structure field eip): Do you
> really mean only the low 32 bits of rip?
> 
> What if the first pass through this function is with guest EIP zero
> and gla also zero?
> 
> And finally, using the context save function seems pretty heavy
> for something that cares about exactly a single item in the
> structure. It would help if you explained in the description why
> no lighter weight alternative would work.

I should have added here: If the vCPU is paused, there surely
should be a cheaper way. If the vCPU is not paused, the value
read is stale by the time you use it.

Jan



Re: [Xen-devel] [PATCH v3] x86/mm: Suppresses vm_events caused by page-walks

2018-02-28 Thread Jan Beulich
>>> On 28.02.18 at 14:25,  wrote:
> --- a/xen/arch/x86/mm/mem_access.c
> +++ b/xen/arch/x86/mm/mem_access.c
> @@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
>  return violation;
>  }
>  
> +static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
> +{
> +struct hvm_hw_cpu ctxt;
> +uint32_t pfec = 0;
> +
> +hvm_funcs.save_cpu_ctxt(v, &ctxt);
> +
> +if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
> + && ga == v->arch.pg_dirty.gla )
> +pfec = PFEC_write_access;
> +
> +paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
> +
> +v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
> +v->arch.pg_dirty.gla = ga;
> +}

This being the only user of v->arch.pg_dirty, why is the new
sub-structure made part of struct arch_vcpu instead of
struct arch_vm_event (or some other sub-structure referenced
by pointer, such that struct arch_vcpu wouldn't grow in size)?
And even without that, this is HVM-specific, so would at best
belong into the HVM sub-structure.

The PFEC_write_access logic is completely unclear to me, despite
the attempt to describe this in the description. I'm pretty sure you
want a code comment here.

Why is the function argument named ga instead of gla (thus
matching the structure field)?

As to using eip (and naming the new structure field eip): Do you
really mean only the low 32 bits of rip?
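
For comparison, tracking the full 64-bit instruction pointer would be a
small change (illustrative only, with the field renamed accordingly):

    /* Hypothetical: compare and store the full RIP instead of the truncated EIP. */
    if ( guest_cpu_user_regs()->rip == v->arch.pg_dirty.rip
         && ga == v->arch.pg_dirty.gla )
        pfec = PFEC_write_access;

    v->arch.pg_dirty.rip = guest_cpu_user_regs()->rip;
    v->arch.pg_dirty.gla = ga;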

What if the first pass through this function is with guest EIP zero
and gla also zero?

And finally, using the context save function seems pretty heavy
for something that cares about exactly a single item in the
structure. It would help if you explained in the description why
no lighter weight alternative would work.

Jan



[Xen-devel] [PATCH v3] x86/mm: Suppresses vm_events caused by page-walks

2018-02-28 Thread Alexandru Isaila
This patch adds a way to enable/disable in-guest page-fault
vm_events. It introduces the xc_monitor_inguest_pagefault function
and adds inguest_pagefault_disabled to the monitor structure.
This is needed by introspection so that it only receives faults
carrying a GLA and does not get spammed with page-walk faults.
In p2m_set_ad_bits(), v->arch.pg_dirty.eip and v->arch.pg_dirty.gla
are used to mark that a fault is hitting the same instruction/address
a second time, i.e. that the dirty bit is being set.

Signed-off-by: Alexandru Isaila 

---
Changes since V2:
- Renamed nested_pagefault to inguest_pagefault
- Added a comment to inguest_pagefault_disabled
---
 tools/libxc/include/xenctrl.h |  2 ++
 tools/libxc/xc_monitor.c  | 14 ++
 xen/arch/x86/mm/mem_access.c  | 27 +++
 xen/arch/x86/monitor.c| 13 +
 xen/include/asm-x86/domain.h  | 10 ++
 xen/include/asm-x86/monitor.h |  3 ++-
 xen/include/public/domctl.h   |  2 ++
 7 files changed, 70 insertions(+), 1 deletion(-)
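
For context, a hypothetical caller of the new libxc function could look
like the sketch below (only the xc_monitor_inguest_pagefault() call
comes from this patch; the rest is standard libxc boilerplate):

#include <xenctrl.h>

/* Hypothetical example: suppress vm_events caused by page walks for a domain. */
static int suppress_pagewalk_events(uint32_t domid)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    int rc;

    if ( !xch )
        return -1;

    /* disable == true sets inguest_pagefault_disabled in the hypervisor. */
    rc = xc_monitor_inguest_pagefault(xch, domid, true);

    xc_interface_close(xch);
    return rc;
}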

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 09e1363..20c2813 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2056,6 +2056,8 @@ int xc_monitor_descriptor_access(xc_interface *xch, uint32_t domain_id,
                                  bool enable);
 int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id,
                              bool enable, bool sync, bool allow_userspace);
+int xc_monitor_inguest_pagefault(xc_interface *xch, uint32_t domain_id,
+                                 bool disable);
 int xc_monitor_debug_exceptions(xc_interface *xch, uint32_t domain_id,
                                 bool enable, bool sync);
 int xc_monitor_cpuid(xc_interface *xch, uint32_t domain_id, bool enable);
diff --git a/tools/libxc/xc_monitor.c b/tools/libxc/xc_monitor.c
index 0233b87..4ac823e 100644
--- a/tools/libxc/xc_monitor.c
+++ b/tools/libxc/xc_monitor.c
@@ -163,6 +163,20 @@ int xc_monitor_guest_request(xc_interface *xch, uint32_t domain_id, bool enable,
     return do_domctl(xch, &domctl);
 }
 
+int xc_monitor_inguest_pagefault(xc_interface *xch, uint32_t domain_id,
+                                 bool disable)
+{
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_monitor_op;
+    domctl.domain = domain_id;
+    domctl.u.monitor_op.op = disable ? XEN_DOMCTL_MONITOR_OP_ENABLE
+                                     : XEN_DOMCTL_MONITOR_OP_DISABLE;
+    domctl.u.monitor_op.event = XEN_DOMCTL_MONITOR_EVENT_INGUEST_PAGEFAULT;
+
+    return do_domctl(xch, &domctl);
+}
+
 int xc_monitor_emulate_each_rep(xc_interface *xch, uint32_t domain_id,
                                 bool enable)
 {
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index c0cd017..057f345 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -137,6 +137,23 @@ bool p2m_mem_access_emulate_check(struct vcpu *v,
     return violation;
 }
 
+static void p2m_set_ad_bits(struct vcpu *v, paddr_t ga)
+{
+    struct hvm_hw_cpu ctxt;
+    uint32_t pfec = 0;
+
+    hvm_funcs.save_cpu_ctxt(v, &ctxt);
+
+    if ( guest_cpu_user_regs()->eip == v->arch.pg_dirty.eip
+         && ga == v->arch.pg_dirty.gla )
+        pfec = PFEC_write_access;
+
+    paging_ga_to_gfn_cr3(v, ctxt.cr3, ga, &pfec, NULL);
+
+    v->arch.pg_dirty.eip = guest_cpu_user_regs()->eip;
+    v->arch.pg_dirty.gla = ga;
+}
+
 bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
                           struct npfec npfec,
                           vm_event_request_t **req_ptr)
@@ -208,6 +225,16 @@ bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
         }
     }
 
+    if ( vm_event_check_ring(d->vm_event_monitor) &&
+         d->arch.monitor.inguest_pagefault_disabled &&
+         npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
+    {
+        v->arch.vm_event->emulate_flags = 0;
+        p2m_set_ad_bits(v, gla);
+
+        return true;
+    }
+
     *req_ptr = NULL;
     req = xzalloc(vm_event_request_t);
     if ( req )
diff --git a/xen/arch/x86/monitor.c b/xen/arch/x86/monitor.c
index f229e69..ce5d1ba 100644
--- a/xen/arch/x86/monitor.c
+++ b/xen/arch/x86/monitor.c
@@ -241,6 +241,19 @@ int arch_monitor_domctl_event(struct domain *d,
         break;
     }
 
+    case XEN_DOMCTL_MONITOR_EVENT_INGUEST_PAGEFAULT:
+    {
+        bool old_status = ad->monitor.inguest_pagefault_disabled;
+
+        if ( unlikely(old_status == requested_status) )
+            return -EEXIST;
+
+        domain_pause(d);
+        ad->monitor.inguest_pagefault_disabled = requested_status;
+        domain_unpause(d);
+        break;
+    }
+
     case XEN_DOMCTL_MONITOR_EVENT_DESC_ACCESS:
     {
         bool old_status = ad->monitor.descriptor_access_enabled;
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 4679d54..37d04a2 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -412,6 +412,11 @@