Re: [Xen-devel] [PATCH v5 5/7] xen/x86: disable global pages for domains with XPTI active

2018-04-06 Thread Juergen Gross
On 06/04/18 17:17, Andrew Cooper wrote:
> On 06/04/18 08:52, Juergen Gross wrote:
>> Instead of flushing the TLB from global pages when switching address
>> spaces with XPTI being active just disable global pages via %cr4
>> completely when a domain subject to XPTI is active. This avoids the
>> need for extra TLB flushes as loading %cr3 will remove all TLB
>> entries.
>>
>> In order to avoid states with cr3/cr4 having inconsistent values
>> (e.g. global pages being activated while cr3 already specifies a XPTI
>> address space) move loading of the new cr4 value to write_ptbase()
>> (actually to write_cr3_cr4() called by write_ptbase()).
>>
>> Signed-off-by: Juergen Gross 
>> Reviewed-by: Jan Beulich 
> 
> This is contrary to the performance optimisations taken by Linux,
> Windows and Apple, which borrow Xen's 64bit PV optimisation of having
> global user pages, because it really is a (small) performance improvement.
> 
> Are there performance numbers for this change alone, and/or are later
> changes strictly dependent on this functionality?

Yes and yes.

Performance is much better for XPTI (again my standard compile test):
elapsed time dropped from 112 to 106 seconds, system time from 160 to
139 seconds, user time from 187 to 186 seconds.

Page invalidation with PCID enabled is _much_ easier.
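
For illustration, a rough sketch of why (hypothetical code, not from
this series; assumes INVPCID hardware support): with global pages
disabled, every TLB entry of an address space carries its PCID tag, so
flushing one context can be a single INVPCID "single-context" operation
instead of a cr4.PGE round trip.

#include <stdint.h>

/* Illustrative sketch only: flush all TLB entries tagged with @pcid
 * via INVPCID type 1 (single-context). */
static inline void flush_context(uint16_t pcid)
{
    struct {
        uint64_t pcid:12, :52;   /* low qword: PCID in bits 0-11 */
        uint64_t addr;           /* high qword: linear address (unused here) */
    } desc = { .pcid = pcid };

    asm volatile ( "invpcid %[desc], %[type]"
                   : /* no outputs */
                   : [desc] "m" (desc), [type] "r" (1UL)
                   : "memory" );
}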

> On the Xen side of things, an argument could probably be made that the
> extra cr4 rewrites due to the L4 shadowing might eat away the
> performance we would otherwise gain, but I'd be hesitant to blindly
> assume that this is the case.

Another problem I wanted to avoid was the global page handling of Xen
private pages: I would have needed to remove all the global bits, either
unconditionally (even for AMD cpus) or dynamically.
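
A rough sketch of the dynamic variant I wanted to avoid (hypothetical
code and flag value, not the real Xen macros):

#include <stdbool.h>

/* Illustrative value only; Xen's real _PAGE_GLOBAL lives in page.h. */
#define _PAGE_GLOBAL (1UL << 8)

/* Compute mapping flags for a Xen private page: the global bit would
 * have to be dropped whenever some domain on the system might run
 * with XPTI, instead of simply clearing cr4.PGE while such a domain
 * is active. */
static unsigned long xen_map_flags(unsigned long base_flags, bool xpti_possible)
{
    return xpti_possible ? base_flags & ~_PAGE_GLOBAL
                         : base_flags | _PAGE_GLOBAL;
}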

> A complicating factor is that Intel have said that the performance gains
> from user global pages would be more noticeable on older hardware, due
> to differences in the TLB architecture.
> 
>> ---
>> V4:
>> - don't use mmu_cr4_features for setting new cr4 value (Jan Beulich)
>> - use simpler scheme for setting X86_CR4_PGE in
>>   pv_guest_cr4_to_real_cr4() (Jan Beulich)
>>
>> V3:
>> - move cr4 loading for all domains from *_ctxt_switch_to() to
>>   write_cr3_cr4() called by write_ptbase() (Jan Beulich)
>> - rebase
>> ---
>>  xen/arch/x86/domain.c  |  5 -
>>  xen/arch/x86/flushtlb.c| 13 -
>>  xen/arch/x86/mm.c  | 14 +++---
>>  xen/arch/x86/x86_64/entry.S| 10 --
>>  xen/common/efi/runtime.c   |  4 ++--
>>  xen/include/asm-x86/domain.h   |  3 ++-
>>  xen/include/asm-x86/flushtlb.h |  2 +-
>>  7 files changed, 24 insertions(+), 27 deletions(-)
>>
>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>> index 9c229594f4..c2bb70c483 100644
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -1522,17 +1522,12 @@ void paravirt_ctxt_switch_from(struct vcpu *v)
>>  void paravirt_ctxt_switch_to(struct vcpu *v)
>>  {
>>  root_pgentry_t *root_pgt = this_cpu(root_pgt);
>> -unsigned long cr4;
>>  
>>  if ( root_pgt )
>>  root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
>>  l4e_from_page(v->domain->arch.perdomain_l3_pg,
>>__PAGE_HYPERVISOR_RW);
>>  
>> -cr4 = pv_guest_cr4_to_real_cr4(v);
>> -if ( unlikely(cr4 != read_cr4()) )
>> -write_cr4(cr4);
>> -
>>  if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
>>  activate_debugregs(v);
>>  
>> diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
>> index f793b70696..5dcd9a2bf6 100644
>> --- a/xen/arch/x86/flushtlb.c
>> +++ b/xen/arch/x86/flushtlb.c
>> @@ -89,20 +89,23 @@ static void do_tlb_flush(void)
>>  post_flush(t);
>>  }
>>  
>> -void write_cr3(unsigned long cr3)
>> +void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
>>  {
>> -unsigned long flags, cr4;
>> +unsigned long flags;
>>  u32 t;
>>  
>>  /* This non-reentrant function is sometimes called in interrupt context. */
>>  local_irq_save(flags);
>>  
>>  t = pre_flush();
>> -cr4 = read_cr4();
>>  
>> -write_cr4(cr4 & ~X86_CR4_PGE);
>> +if ( read_cr4() & X86_CR4_PGE )
>> +write_cr4(cr4 & ~X86_CR4_PGE);
>> +
>>  asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
>> -write_cr4(cr4);
>> +
>> +if ( read_cr4() != cr4 )
> 
> read_cr4(), despite being a cached read, isn't free because of the %rsp
> manipulation required to access the variable.  I'd keep the locally
> cached cr4, and use "if ( cr4 & X86_CR4_PGE )" here.

I did this on purpose: cr4 might be modified in more than just PGE. I
can nevertheless cache the read cr4 value, of course.
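
A sketch of how that could look (based on the hunk quoted above, using
the same flushtlb.c helpers; not necessarily the final version):

void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
{
    unsigned long flags, old_cr4;
    u32 t;

    /* This non-reentrant function is sometimes called in interrupt context. */
    local_irq_save(flags);

    t = pre_flush();

    old_cr4 = read_cr4();
    if ( old_cr4 & X86_CR4_PGE )
    {
        /* Drop PGE before the cr3 write so no global entries survive. */
        old_cr4 = cr4 & ~X86_CR4_PGE;
        write_cr4(old_cr4);
    }

    asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );

    /* Full compare: bits other than PGE may need updating, too. */
    if ( old_cr4 != cr4 )
        write_cr4(cr4);

    post_flush(t);

    local_irq_restore(flags);
}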


Juergen


Re: [Xen-devel] [PATCH v5 5/7] xen/x86: disable global pages for domains with XPTI active

2018-04-06 Thread Andrew Cooper
On 06/04/18 08:52, Juergen Gross wrote:
> Instead of flushing the TLB from global pages when switching address
> spaces with XPTI being active just disable global pages via %cr4
> completely when a domain subject to XPTI is active. This avoids the
> need for extra TLB flushes as loading %cr3 will remove all TLB
> entries.
>
> In order to avoid states with cr3/cr4 having inconsistent values
> (e.g. global pages being activated while cr3 already specifies a XPTI
> address space) move loading of the new cr4 value to write_ptbase()
> (actually to write_cr3_cr4() called by write_ptbase()).
>
> Signed-off-by: Juergen Gross 
> Reviewed-by: Jan Beulich 

This is contrary to the performance optimisations taken by Linux,
Windows and Apple, which borrow Xen's 64bit PV optimisation of having
global user pages, because it really is a (small) performance improvement.

Are there performance numbers for this change alone, and/or are later
changes strictly dependent on this functionality?

On the Xen side of things, an argument could probably be made that the
extra cr4 rewrites due to the L4 shadowing might eat away the
performance we would otherwise gain, but I'd be hesitant to blindly
assume that this is the case.

A complicating factor is that Intel have said that the performance gains
from user global pages would be more noticeable on older hardware, due
to differences in the TLB architecture.

> ---
> V4:
> - don't use mmu_cr4_features for setting new cr4 value (Jan Beulich)
> - use simpler scheme for setting X86_CR4_PGE in
>   pv_guest_cr4_to_real_cr4() (Jan Beulich)
>
> V3:
> - move cr4 loading for all domains from *_ctxt_switch_to() to
>   write_cr3_cr4() called by write_ptbase() (Jan Beulich)
> - rebase
> ---
>  xen/arch/x86/domain.c  |  5 -
>  xen/arch/x86/flushtlb.c| 13 -
>  xen/arch/x86/mm.c  | 14 +++---
>  xen/arch/x86/x86_64/entry.S| 10 --
>  xen/common/efi/runtime.c   |  4 ++--
>  xen/include/asm-x86/domain.h   |  3 ++-
>  xen/include/asm-x86/flushtlb.h |  2 +-
>  7 files changed, 24 insertions(+), 27 deletions(-)
>
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 9c229594f4..c2bb70c483 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1522,17 +1522,12 @@ void paravirt_ctxt_switch_from(struct vcpu *v)
>  void paravirt_ctxt_switch_to(struct vcpu *v)
>  {
>  root_pgentry_t *root_pgt = this_cpu(root_pgt);
> -unsigned long cr4;
>  
>  if ( root_pgt )
>  root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
>  l4e_from_page(v->domain->arch.perdomain_l3_pg,
>__PAGE_HYPERVISOR_RW);
>  
> -cr4 = pv_guest_cr4_to_real_cr4(v);
> -if ( unlikely(cr4 != read_cr4()) )
> -write_cr4(cr4);
> -
>  if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
>  activate_debugregs(v);
>  
> diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
> index f793b70696..5dcd9a2bf6 100644
> --- a/xen/arch/x86/flushtlb.c
> +++ b/xen/arch/x86/flushtlb.c
> @@ -89,20 +89,23 @@ static void do_tlb_flush(void)
>  post_flush(t);
>  }
>  
> -void write_cr3(unsigned long cr3)
> +void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
>  {
> -unsigned long flags, cr4;
> +unsigned long flags;
>  u32 t;
>  
>  /* This non-reentrant function is sometimes called in interrupt context. */
>  local_irq_save(flags);
>  
>  t = pre_flush();
> -cr4 = read_cr4();
>  
> -write_cr4(cr4 & ~X86_CR4_PGE);
> +if ( read_cr4() & X86_CR4_PGE )
> +write_cr4(cr4 & ~X86_CR4_PGE);
> +
>  asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
> -write_cr4(cr4);
> +
> +if ( read_cr4() != cr4 )

read_cr4(), despite being a cached read, isn't free because of the %rsp
manipulation required to access the variable.  I'd keep the locally
cached cr4, and use "if ( cr4 & X86_CR4_PGE )" here.

~Andrew


[Xen-devel] [PATCH v5 5/7] xen/x86: disable global pages for domains with XPTI active

2018-04-06 Thread Juergen Gross
Instead of flushing the TLB from global pages when switching address
spaces with XPTI being active just disable global pages via %cr4
completely when a domain subject to XPTI is active. This avoids the
need for extra TLB flushes as loading %cr3 will remove all TLB
entries.

In order to avoid states with cr3/cr4 having inconsistent values
(e.g. global pages being activated while cr3 already specifies a XPTI
address space) move loading of the new cr4 value to write_ptbase()
(actually to write_cr3_cr4() called by write_ptbase()).

Signed-off-by: Juergen Gross 
Reviewed-by: Jan Beulich 
---
V4:
- don't use mmu_cr4_features for setting new cr4 value (Jan Beulich)
- use simpler scheme for setting X86_CR4_PGE in
  pv_guest_cr4_to_real_cr4() (Jan Beulich)

V3:
- move cr4 loading for all domains from *_ctxt_switch_to() to
  write_cr3_cr4() called by write_ptbase() (Jan Beulich)
- rebase
---
 xen/arch/x86/domain.c  |  5 -
 xen/arch/x86/flushtlb.c| 13 -
 xen/arch/x86/mm.c  | 14 +++---
 xen/arch/x86/x86_64/entry.S| 10 --
 xen/common/efi/runtime.c   |  4 ++--
 xen/include/asm-x86/domain.h   |  3 ++-
 xen/include/asm-x86/flushtlb.h |  2 +-
 7 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9c229594f4..c2bb70c483 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1522,17 +1522,12 @@ void paravirt_ctxt_switch_from(struct vcpu *v)
 void paravirt_ctxt_switch_to(struct vcpu *v)
 {
 root_pgentry_t *root_pgt = this_cpu(root_pgt);
-unsigned long cr4;
 
 if ( root_pgt )
 root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
 l4e_from_page(v->domain->arch.perdomain_l3_pg,
   __PAGE_HYPERVISOR_RW);
 
-cr4 = pv_guest_cr4_to_real_cr4(v);
-if ( unlikely(cr4 != read_cr4()) )
-write_cr4(cr4);
-
 if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
 activate_debugregs(v);
 
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index f793b70696..5dcd9a2bf6 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -89,20 +89,23 @@ static void do_tlb_flush(void)
 post_flush(t);
 }
 
-void write_cr3(unsigned long cr3)
+void write_cr3_cr4(unsigned long cr3, unsigned long cr4)
 {
-unsigned long flags, cr4;
+unsigned long flags;
 u32 t;
 
 /* This non-reentrant function is sometimes called in interrupt context. */
 local_irq_save(flags);
 
 t = pre_flush();
-cr4 = read_cr4();
 
-write_cr4(cr4 & ~X86_CR4_PGE);
+if ( read_cr4() & X86_CR4_PGE )
+write_cr4(cr4 & ~X86_CR4_PGE);
+
 asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
-write_cr4(cr4);
+
+if ( read_cr4() != cr4 )
+write_cr4(cr4);
 
 post_flush(t);
 
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 131433af9b..55c437751f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -506,20 +506,28 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
 void write_ptbase(struct vcpu *v)
 {
 struct cpu_info *cpu_info = get_cpu_info();
+unsigned long new_cr4;
+
+new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v))
+  ? pv_guest_cr4_to_real_cr4(v)
+  : ((read_cr4() & ~X86_CR4_TSD) | X86_CR4_PGE);
 
 if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti )
 {
 cpu_info->root_pgt_changed = true;
 cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
-asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
+write_cr3_cr4(v->arch.cr3, new_cr4);
 }
 else
 {
-/* Make sure to clear xen_cr3 before pv_cr3; write_cr3() serializes. */
+/* Make sure to clear xen_cr3 before pv_cr3. */
 cpu_info->xen_cr3 = 0;
-write_cr3(v->arch.cr3);
+/* write_cr3_cr4() serializes. */
+write_cr3_cr4(v->arch.cr3, new_cr4);
 cpu_info->pv_cr3 = 0;
 }
+
+ASSERT(is_pv_vcpu(v) || read_cr4() == mmu_cr4_features);
 }
 
 /*
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 5026cfa490..6298ed65cc 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -154,13 +154,8 @@ restore_all_guest:
 ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
 rep movsq
 .Lrag_copy_done:
-mov   STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
 mov   %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
-mov   %rdi, %rsi
-and   $~X86_CR4_PGE, %rdi
-mov   %rdi, %cr4
 mov   %rax, %cr3
-mov   %rsi, %cr4
 .Lrag_keep_cr3:
 
 /* Restore stashed SPEC_CTRL value. */
@@ -216,12 +211,7 @@ restore_all_xen:
  * so "g" will have to do.
  */
 UNLIKELY_START(g, exit_cr3)
-mov   %cr4, %rdi
-mov   %rdi, %rsi
-and   $~X86_CR4_PGE, %rdi
-mov   %rdi, %cr4
 mov   %rax, %cr3
-mov   %rsi, %cr4