Re: [Xen-devel] [PATCH v3 5/5] x86: use PDEP for PTE flags insertion when available

2018-08-20 Thread Jan Beulich
>>> On 18.08.18 at 03:08,  wrote:
> On Aug 17, 2018, at 03:24, Jan Beulich  wrote:
>> 
>> This replaces 5 instructions by a single one, further reducing code size,
>> cache, and TLB footprint (in particular on systems supporting BMI2).
> 
> This link claims that BMI2 may be less performant/consistent on AMD Ryzen 
> than Intel:
> https://www.reddit.com/r/Amd/comments/60i6er/ryzen_and_bmi2_strange_behavior 
> _and_high_latencies/

Hmm, interesting. Brian - any word as to whether we'd better avoid
using PDEP/PEXT for now on AMD?

> Would this patch series have any benefit to L0 hypervisors/rootkits (e.g. 
> Bromium, Bareflank or similar hypervisors) which could be monitoring L1 Xen?  
> Or Xen as L0 hypervisor and Hyper-V as L1 hypervisor?

The insns aren't used on any secrets, so I don't see the connection.
But then again I'm not an expert here at all.

Jan



___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 5/5] x86: use PDEP for PTE flags insertion when available

2018-08-17 Thread Rich Persaud
On Aug 17, 2018, at 03:24, Jan Beulich  wrote:
> 
> This replaces 5 instructions by a single one, further reducing code size,
> cache, and TLB footprint (in particular on systems supporting BMI2).

This link claims that BMI2 may be less performant/consistent on AMD Ryzen than 
Intel:
https://www.reddit.com/r/Amd/comments/60i6er/ryzen_and_bmi2_strange_behavior_and_high_latencies/

Would this patch series have any benefit to L0 hypervisors/rootkits (e.g. 
Bromium, Bareflank or similar hypervisors) which could be monitoring L1 Xen?  
Or Xen as L0 hypervisor and Hyper-V as L1 hypervisor?

Rich___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v3 5/5] x86: use PDEP for PTE flags insertion when available

2018-08-17 Thread Jan Beulich
This replaces 5 instructions by a single one, further reducing code size,
cache, and TLB footprint (in particular on systems supporting BMI2).

Signed-off-by: Jan Beulich 
---
Irrespective of the note regarding a possible alternative route, I think
the change here is an improvement until someone would take on
investigating whether the proposed dropping of the 24-bit flags
representation would be a worthwhile improvement of generated code.
---
v3: Move this patch last in series. Re-base.
v2: Avoid quoted symbols; use gcc's new V operand modifier instead.
Re-base.
---
TBD: Also change get_pte_flags() (after having introduced test_pte_flags())?

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -393,6 +393,8 @@ void __init arch_init_memory(void)
 paddr_t __read_mostly ma_real_mask = ~0UL;
 unsigned long __read_mostly pfn_real_mask = ~0UL;
 
+const intpte_t pte_flags_mask = ~(PADDR_MASK & PAGE_MASK);
+
 #ifndef CONFIG_INDIRECT_THUNK /* V modifier unavailable? */
 
 /* Conversion between machine address and direct map offset. */
@@ -419,6 +421,11 @@ unsigned long pfn2pdx(unsigned long pfn)
 return generic_pfn_to_pdx(pfn);
 }
 
+intpte_t put_pte_flags_v(unsigned int flags)
+{
+return put_pte_flags_c(flags);
+}
+
 #endif
 
 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -205,15 +205,53 @@ typedef l4_pgentry_t root_pgentry_t;
 
 /* Extract flags into 24-bit integer, or turn 24-bit flags into a pte mask. */
 #ifndef __ASSEMBLY__
+extern const intpte_t pte_flags_mask;
+intpte_t __attribute_const__ put_pte_flags_v(unsigned int x);
+
 static inline unsigned int get_pte_flags(intpte_t x)
 {
 return ((x >> 40) & ~0xfff) | (x & 0xfff);
 }
 
-static inline intpte_t put_pte_flags(unsigned int x)
+static inline intpte_t put_pte_flags_c(unsigned int x)
 {
 return (((intpte_t)x & ~0xfff) << 40) | (x & 0xfff);
 }
+
+static always_inline intpte_t put_pte_flags(unsigned int x)
+{
+intpte_t pte;
+
+if ( __builtin_constant_p(x) )
+return put_pte_flags_c(x);
+
+#ifdef CONFIG_INDIRECT_THUNK /* V modifier available? */
+#define SYMNAME(pfx...) #pfx "put_pte_flags_%V[pte]_%V[flags]"
+alternative_io("call " SYMNAME() "\n\t"
+   LINKONCE_PROLOGUE(SYMNAME) "\n\t"
+   "mov %[flags], %k[pte]\n\t"
+   "and $0xfff000, %[flags]\n\t"
+   "and $0x000fff, %k[pte]\n\t"
+   "shl $40, %q[flags]\n\t"
+   "or %q[flags], %[pte]\n\t"
+   "ret\n\t"
+   LINKONCE_EPILOGUE(SYMNAME),
+   "pdep %[mask], %q[flags], %[pte]", X86_FEATURE_BMI2,
+   ASM_OUTPUT2([pte] "=" (pte), [flags] "+r" (x)),
+   [mask] "m" (pte_flags_mask));
+#undef SYMNAME
+#else
+alternative_io("call put_pte_flags_v",
+   /* pdep pte_flags_mask(%rip), %rdi, %rax */
+   ".byte 0xc4, 0xe2, 0xc3, 0xf5, 0x05\n\t"
+   ".long pte_flags_mask - 4 - .",
+   X86_FEATURE_BMI2,
+   ASM_OUTPUT2("=a" (pte), "+D" (x)), "m" (pte_flags_mask)
+   : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11");
+#endif
+
+return pte;
+}
 #endif
 
 /*





___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel