Re: [Xen-devel] Getting rid of inside_vm in intel8x0

2016-04-02 Thread Andy Lutomirski
properties of hardware.  Just off the top of my head, here are
some types of VM and what they might imply about hardware:

Intel Kernel Guard: your sound card is passed through from real hardware.

Xen: could go either way.  In dom0, it's likely passed through.  In
domU, it could be passed through or emulated, and I believe this is
the case for all of the Xen variants.

KVM: Probably emulated, but could be passed through.

I think the main reason that Luis and I are both uncomfortable with
"am I in a VM" checks is that they're rarely the right thing to be
detecting, the APIs are poorly designed, and most of the use cases in
the kernel are using them as a proxy for something else and would be
clearer and more future proof if they tested what they actually need
to test more directly.

>
>> > You may ask whether we can reduce the whole workaround instead.  It's
>> > practically impossible.  We don't know which models doing so and which
>> > not.  And, the hardware in question are (literally) thousands of
>> > variants of damn old PC mobos.  Any fundamental change needs to be
>> > verified on all these machines...
>>
>> What if we can come up with algorithm on the ring buffer that would
>> satisfy both cases without special casing it ? Is removing this VM
>> check impossible really?
>
> Yes, it's impossible practically, see my comment above.
> Whatever you change, you need to verify it on real machines.  And it's
> very difficult to achieve.

But, given what I think you're saying, you only need to test one way:
if the non-VM code works and is just slow on a VM, then wouldn't it be
okay if there were some heuristic that were always right on bare metal
and mostly right on a VM?

Anyway, I still don't see what's wrong with just measuring how long an
iteration of your loop takes.  Sure, on both bare metal and on a VM,
there are all kinds of timing errors due to SMI and such, but I don't
think it's true at all that hypervisors will show you only guest time.
The sound drivers don't run early in boot -- they run when full kernel
functionality is available.  Both the ktime_* APIs and
CLOCK_MONOTONIC_RAW should give actual physical elapsed time.  After
all, if they didn't, then simply reading the clock in a VM guest would
be completely broken.

In other words, a simple heuristic could be that, if each of the first
four iterations takes >100 microseconds (or whatever the actual number
is that starts causing real problems on a VM), then switch to the VM
variant.  After all, if you run on native hardware that's so slow that
your loop will just time out, then you don't gain anything by actually
letting it time out, and, if you're on a VM that's so fast that it
doesn't matter, then it shouldn't matter what you do.

--Andy


-- 
Andy Lutomirski
AMA Capital Management, LLC

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 3/9] x86/head: Move early exception panic code into early_fixup_exception

2016-04-02 Thread Andy Lutomirski
This removes a bunch of assembly and adds some C code instead.  It
changes the actual printouts on both 32-bit and 64-bit kernels, but
they still seem okay.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/uaccess.h |  2 +-
 arch/x86/kernel/head_32.S  | 49 +-
 arch/x86/kernel/head_64.S  | 45 ++
 arch/x86/mm/extable.c  | 29 -
 4 files changed, 32 insertions(+), 93 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a3afb7259751..83fd2cf187d2 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -111,7 +111,7 @@ struct exception_table_entry {
 
 extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern bool ex_has_fault_handler(unsigned long ip);
-extern int early_fixup_exception(struct pt_regs *regs, int trapnr);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 5e6ce845813a..411dce93fee9 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -555,8 +555,6 @@ early_idt_handler_common:
 */
cld
 
-   cmpl $2,%ss:early_recursion_flag
-   je hlt_loop
incl %ss:early_recursion_flag
 
/* The vector number is in pt_regs->gs */
@@ -588,13 +586,8 @@ early_idt_handler_common:
movw%gs, PT_GS(%esp)
movw$0, PT_GS+2(%esp)
 
-   cmpl $(__KERNEL_CS),PT_CS(%esp)
-   jne 10f
-
movl%esp, %eax  /* args are pt_regs (EAX), trapnr (EDX) */
callearly_fixup_exception
-   andl%eax,%eax
-   jz  10f /* Exception wasn't fixed up */
 
popl%ebx/* pt_regs->bx */
popl%ecx/* pt_regs->cx */
@@ -610,29 +603,6 @@ early_idt_handler_common:
decl%ss:early_recursion_flag
addl$4, %esp/* pop pt_regs->orig_ax */
iret
-
-10:
-#ifdef CONFIG_PRINTK
-   xorl %eax,%eax
-   movw %ax,PT_FS+2(%esp)  /* clean up the segment values on some cpus */
-   movw %ax,PT_DS+2(%esp)
-   movw %ax,PT_ES+2(%esp)
-   leal  40(%esp),%eax
-   pushl %eax  /* %esp before the exception */
-   pushl %ebx
-   pushl %ebp
-   pushl %esi
-   pushl %edi
-   movl %cr2,%eax
-   pushl %eax
-   pushl (20+6*4)(%esp)/* trapno */
-   pushl $fault_msg
-   call printk
-#endif
-   call dump_stack
-hlt_loop:
-   hlt
-   jmp hlt_loop
 ENDPROC(early_idt_handler_common)
 
 /* This is the default interrupt "handler" :-) */
@@ -668,10 +638,14 @@ ignore_int:
popl %eax
 #endif
iret
+
+hlt_loop:
+   hlt
+   jmp hlt_loop
 ENDPROC(ignore_int)
 __INITDATA
.align 4
-early_recursion_flag:
+GLOBAL(early_recursion_flag)
.long 0
 
 __REFDATA
@@ -736,19 +710,6 @@ __INITRODATA
 int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
 
-fault_msg:
-/* fault info: */
-   .ascii "BUG: Int %d: CR2 %p\n"
-/* regs pushed in early_idt_handler: */
-   .ascii " EDI %p  ESI %p  EBP %p  EBX %p\n"
-   .ascii " ESP %p   ES %p   DS %p\n"
-   .ascii " EDX %p  ECX %p  EAX %p\n"
-/* fault frame: */
-   .ascii " vec %p  err %p  EIP %p   CS %p  flg %p\n"
-   .ascii "Stack: %p %p %p %p %p %p %p %p\n"
-   .ascii "   %p %p %p %p %p %p %p %p\n"
-   .asciz "   %p %p %p %p %p %p %p %p\n"
-
 #include "../../x86/xen/xen-head.S"
 
 /*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index af87896b6a23..c39b6437cf03 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -354,8 +354,6 @@ early_idt_handler_common:
 */
cld
 
-   cmpl $2,early_recursion_flag(%rip)
-   jz  1f
incl early_recursion_flag(%rip)
 
/* The vector number is currently in the pt_regs->di slot. */
@@ -376,9 +374,6 @@ early_idt_handler_common:
pushq %r14  /* pt_regs->r14 */
pushq %r15  /* pt_regs->r15 */
 
-   cmpl $__KERNEL_CS,CS(%rsp)
-   jne 11f
-
cmpq $14,%rsi   /* Page fault? */
jnz 10f
GET_CR2_INTO(%rdi)  /* Can clobber any volatile register if pv */
@@ -389,37 +384,8 @@ early_idt_handler_common:
 10:
movq %rsp,%rdi  /* RDI = pt_regs; RSI is already trapnr */
call early_fixup_exception
-   andl %eax,%eax
-   jnz 20f # Found an exception entry
-
-11:
-#ifdef CONFIG_EARLY_PRINTK
-   /*
-* On paravirt kernels, GET_CR2_INTO clobbers callee-clobbered regs.
-* We only care about

[Xen-devel] [PATCH v5 8/9] x86/paravirt: Make "unsafe" MSR accesses unsafe even if PARAVIRT=y

2016-04-02 Thread Andy Lutomirski
Enabling CONFIG_PARAVIRT had an unintended side effect: rdmsr turned
into rdmsr_safe and wrmsr turned into wrmsr_safe, even on bare
metal.  Undo that by using the new unsafe paravirt MSR hooks.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/paravirt.h | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 68297d87e85c..0c99f10874e4 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -151,24 +151,21 @@ static inline int paravirt_write_msr_safe(unsigned msr,
return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high);
 }
 
-/* These should all do BUG_ON(_err), but our headers are too tangled. */
 #define rdmsr(msr, val1, val2) \
 do {   \
-   int _err;   \
-   u64 _l = paravirt_read_msr_safe(msr, &_err);\
+   u64 _l = paravirt_read_msr(msr);\
val1 = (u32)_l; \
val2 = _l >> 32;\
 } while (0)
 
 #define wrmsr(msr, val1, val2) \
 do {   \
-   paravirt_write_msr_safe(msr, val1, val2);   \
+   paravirt_write_msr(msr, val1, val2);\
 } while (0)
 
 #define rdmsrl(msr, val)   \
 do {   \
-   int _err;   \
-   val = paravirt_read_msr_safe(msr, &_err);   \
+   val = paravirt_read_msr(msr);   \
 } while (0)
 
 static inline void wrmsrl(unsigned msr, u64 val)
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 9/9] x86/msr: Set the return value to zero when native_rdmsr_safe fails

2016-04-02 Thread Andy Lutomirski
This will cause unchecked native_rdmsr_safe failures to return
deterministic results.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/msr.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 13da359881d7..e97e79f8a22b 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -109,7 +109,10 @@ static inline unsigned long long 
native_read_msr_safe(unsigned int msr,
asm volatile("2: rdmsr ; xor %[err],%[err]\n"
 "1:\n\t"
 ".section .fixup,\"ax\"\n\t"
-"3:  mov %[fault],%[err] ; jmp 1b\n\t"
+"3: mov %[fault],%[err]\n\t"
+"xorl %%eax, %%eax\n\t"
+"xorl %%edx, %%edx\n\t"
+"jmp 1b\n\t"
 ".previous\n\t"
 _ASM_EXTABLE(2b, 3b)
 : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 5/9] x86/paravirt: Add _safe to the read_msr and write_msr PV hooks

2016-04-02 Thread Andy Lutomirski
These hooks match the _safe variants, so name them accordingly.
This will make room for unsafe PV hooks.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/paravirt.h   | 33 +
 arch/x86/include/asm/paravirt_types.h |  8 
 arch/x86/kernel/paravirt.c|  4 ++--
 arch/x86/xen/enlighten.c  |  4 ++--
 4 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f6192502149e..2e49228ed9a3 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -129,34 +129,35 @@ static inline void wbinvd(void)
 
 #define get_kernel_rpl()  (pv_info.kernel_rpl)
 
-static inline u64 paravirt_read_msr(unsigned msr, int *err)
+static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
 {
-   return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
+   return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err);
 }
 
-static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
+static inline int paravirt_write_msr_safe(unsigned msr,
+ unsigned low, unsigned high)
 {
-   return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
+   return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high);
 }
 
 /* These should all do BUG_ON(_err), but our headers are too tangled. */
 #define rdmsr(msr, val1, val2) \
 do {   \
int _err;   \
-   u64 _l = paravirt_read_msr(msr, &_err); \
+   u64 _l = paravirt_read_msr_safe(msr, &_err);\
val1 = (u32)_l; \
val2 = _l >> 32;\
 } while (0)
 
 #define wrmsr(msr, val1, val2) \
 do {   \
-   paravirt_write_msr(msr, val1, val2);\
+   paravirt_write_msr_safe(msr, val1, val2);   \
 } while (0)
 
 #define rdmsrl(msr, val)   \
 do {   \
int _err;   \
-   val = paravirt_read_msr(msr, &_err);\
+   val = paravirt_read_msr_safe(msr, &_err);   \
 } while (0)
 
 static inline void wrmsrl(unsigned msr, u64 val)
@@ -164,23 +165,23 @@ static inline void wrmsrl(unsigned msr, u64 val)
wrmsr(msr, (u32)val, (u32)(val>>32));
 }
 
-#define wrmsr_safe(msr, a, b)  paravirt_write_msr(msr, a, b)
+#define wrmsr_safe(msr, a, b)  paravirt_write_msr_safe(msr, a, b)
 
 /* rdmsr with exception handling */
-#define rdmsr_safe(msr, a, b)  \
-({ \
-   int _err;   \
-   u64 _l = paravirt_read_msr(msr, &_err); \
-   (*a) = (u32)_l; \
-   (*b) = _l >> 32;\
-   _err;   \
+#define rdmsr_safe(msr, a, b)  \
+({ \
+   int _err;   \
+   u64 _l = paravirt_read_msr_safe(msr, &_err);\
+   (*a) = (u32)_l; \
+   (*b) = _l >> 32;\
+   _err;   \
 })
 
 static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 {
int err;
 
-   *p = paravirt_read_msr(msr, &err);
+   *p = paravirt_read_msr_safe(msr, &err);
return err;
 }
 
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 77db5616a473..5a06cccd36f0 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -155,10 +155,10 @@ struct pv_cpu_ops {
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
  unsigned int *ecx, unsigned int *edx);
 
-   /* MSR, PMC and TSR operations.
-  err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
-   u64 (*read_msr)(unsigned int msr, int *err);
-   int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
+   /* MSR operations.
+  err = 0/-EIO.  wrmsr returns 0/-EIO. */
+   u64 (*read_msr_safe)(unsigned int msr, int *err);
+   int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high);
 
u64 (*read_pmc)(int counter);
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f08ac28b8136..8aad95478ae5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -339,8 +339,8 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.write_cr8 = native_write_cr8,
 #endif
.wbinvd = native_wbinvd,
-   .read_msr = native_read_msr_safe,
-   .write_msr = native_write_msr_safe,
+   .read_msr_safe = native_read_msr_safe,
+   .w

[Xen-devel] [PATCH v5 4/9] x86/traps: Enable all exception handler callbacks early

2016-04-02 Thread Andy Lutomirski
Now that early_fixup_exception has pt_regs, we can just call
fixup_exception from it.  This will make fancy exception handlers
work early.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/extable.c | 19 ++-
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 8997022abebc..50dfe438bd91 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -95,10 +95,6 @@ extern unsigned int early_recursion_flag;
 /* Restricted version used during very early boot */
 void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
 {
-   const struct exception_table_entry *e;
-   unsigned long new_ip;
-   ex_handler_t handler;
-
/* Ignore early NMIs. */
if (trapnr == X86_TRAP_NMI)
return;
@@ -109,19 +105,8 @@ void __init early_fixup_exception(struct pt_regs *regs, 
int trapnr)
if (regs->cs != __KERNEL_CS)
goto fail;
 
-   e = search_exception_tables(regs->ip);
-   if (!e)
-   goto fail;
-
-   new_ip  = ex_fixup_addr(e);
-   handler = ex_fixup_handler(e);
-
-   /* special handling not supported during early boot */
-   if (handler != ex_handler_default)
-   goto fail;
-
-   regs->ip = new_ip;
-   return;
+   if (fixup_exception(regs, trapnr))
+   return;
 
 fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 
0x%lx\n",
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 2/9] x86/head: Move the early NMI fixup into C

2016-04-02 Thread Andy Lutomirski
C is nicer than asm.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/kernel/head_32.S | 7 ---
 arch/x86/kernel/head_64.S | 6 --
 arch/x86/mm/extable.c | 5 +
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index bef3e6c49b56..5e6ce845813a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -555,9 +555,6 @@ early_idt_handler_common:
 */
cld
 
-   cmpl $2,(%esp)  # X86_TRAP_NMI
-   je .Lis_nmi # Ignore NMI
-
cmpl $2,%ss:early_recursion_flag
je hlt_loop
incl %ss:early_recursion_flag
@@ -636,10 +633,6 @@ early_idt_handler_common:
 hlt_loop:
hlt
jmp hlt_loop
-
-.Lis_nmi:
-   addl $8,%esp/* drop vector number and error code */
-   iret
 ENDPROC(early_idt_handler_common)
 
 /* This is the default interrupt "handler" :-) */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index f8d6dad41e8d..af87896b6a23 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -354,9 +354,6 @@ early_idt_handler_common:
 */
cld
 
-   cmpl $2,(%rsp)  # X86_TRAP_NMI
-   je .Lis_nmi # Ignore NMI
-
cmpl $2,early_recursion_flag(%rip)
jz  1f
incl early_recursion_flag(%rip)
@@ -425,9 +422,6 @@ early_idt_handler_common:
 20:/* Exception table entry found or page table generated */
decl early_recursion_flag(%rip)
jmp restore_regs_and_iret
-.Lis_nmi:
-   addq $16,%rsp   # drop vector number and error code
-   INTERRUPT_RETURN
 ENDPROC(early_idt_handler_common)
 
__INITDATA
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 176e48de25d4..d6e4e6fb4002 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,6 +2,7 @@
 #include 
 #include 
 #include 
+#include 
 
 typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int);
@@ -96,6 +97,10 @@ int __init early_fixup_exception(struct pt_regs *regs, int 
trapnr)
unsigned long new_ip;
ex_handler_t handler;
 
+   /* Ignore early NMIs. */
+   if (trapnr == X86_TRAP_NMI)
+   return 1;
+
e = search_exception_tables(regs->ip);
if (!e)
return 0;
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 1/9] x86/head: Pass a real pt_regs and trapnr to early_fixup_exception

2016-04-02 Thread Andy Lutomirski
early_fixup_exception is limited by the fact that it doesn't have a
real struct pt_regs.  Change both the 32-bit and 64-bit asm and the
C code to pass and accept a real pt_regs.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/uaccess.h |  2 +-
 arch/x86/kernel/head_32.S  | 74 +-
 arch/x86/kernel/head_64.S  | 68 --
 arch/x86/mm/extable.c  |  6 ++--
 4 files changed, 92 insertions(+), 58 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c0f27d7ea7ff..a3afb7259751 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -111,7 +111,7 @@ struct exception_table_entry {
 
 extern int fixup_exception(struct pt_regs *regs, int trapnr);
 extern bool ex_has_fault_handler(unsigned long ip);
-extern int early_fixup_exception(unsigned long *ip);
+extern int early_fixup_exception(struct pt_regs *regs, int trapnr);
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 6bc9ae24b6d2..bef3e6c49b56 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -562,29 +562,64 @@ early_idt_handler_common:
je hlt_loop
incl %ss:early_recursion_flag
 
-   push %eax   # 16(%esp)
-   push %ecx   # 12(%esp)
-   push %edx   #  8(%esp)
-   push %ds#  4(%esp)
-   push %es#  0(%esp)
-   movl $(__KERNEL_DS),%eax
-   movl %eax,%ds
-   movl %eax,%es
+   /* The vector number is in pt_regs->gs */
 
-   cmpl $(__KERNEL_CS),32(%esp)
+   cld
+   pushl   %fs /* pt_regs->fs */
+   movw$0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
+   pushl   %es /* pt_regs->es */
+   movw$0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
+   pushl   %ds /* pt_regs->ds */
+   movw$0, 2(%esp) /* clear high bits (some CPUs leave garbage) */
+   pushl   %eax/* pt_regs->ax */
+   pushl   %ebp/* pt_regs->bp */
+   pushl   %edi/* pt_regs->di */
+   pushl   %esi/* pt_regs->si */
+   pushl   %edx/* pt_regs->dx */
+   pushl   %ecx/* pt_regs->cx */
+   pushl   %ebx/* pt_regs->bx */
+
+   /* Fix up DS and ES */
+   movl$(__KERNEL_DS), %ecx
+   movl%ecx, %ds
+   movl%ecx, %es
+
+   /* Load the vector number into EDX */
+   movlPT_GS(%esp), %edx
+
+   /* Load GS into pt_regs->gs and clear high bits */
+   movw%gs, PT_GS(%esp)
+   movw$0, PT_GS+2(%esp)
+
+   cmpl $(__KERNEL_CS),PT_CS(%esp)
jne 10f
 
-   leal 28(%esp),%eax  # Pointer to %eip
-   call early_fixup_exception
-   andl %eax,%eax
-   jnz ex_entry/* found an exception entry */
+   movl%esp, %eax  /* args are pt_regs (EAX), trapnr (EDX) */
+   callearly_fixup_exception
+   andl%eax,%eax
+   jz  10f /* Exception wasn't fixed up */
+
+   popl%ebx/* pt_regs->bx */
+   popl%ecx/* pt_regs->cx */
+   popl%edx/* pt_regs->dx */
+   popl%esi/* pt_regs->si */
+   popl%edi/* pt_regs->di */
+   popl%ebp/* pt_regs->bp */
+   popl%eax/* pt_regs->ax */
+   popl%ds /* pt_regs->ds */
+   popl%es /* pt_regs->es */
+   popl%fs /* pt_regs->fs */
+   popl%gs /* pt_regs->gs */
+   decl%ss:early_recursion_flag
+   addl$4, %esp/* pop pt_regs->orig_ax */
+   iret
 
 10:
 #ifdef CONFIG_PRINTK
xorl %eax,%eax
-   movw %ax,2(%esp)/* clean up the segment values on some cpus */
-   movw %ax,6(%esp)
-   movw %ax,34(%esp)
+   movw %ax,PT_FS+2(%esp)  /* clean up the segment values on some cpus */
+   movw %ax,PT_DS+2(%esp)
+   movw %ax,PT_ES+2(%esp)
leal  40(%esp),%eax
pushl %eax  /* %esp before the exception */
pushl %ebx
@@ -602,13 +637,6 @@ hlt_loop:
hlt
jmp hlt_loop
 
-ex_entry:
-   pop %es
-   pop %ds
-   pop %edx
-   pop %ecx
-   pop %eax
-   decl %ss:early_recursion_flag
 .Lis_nmi:
addl $8,%esp/* drop vector number and error code */
iret
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e860390..f8d6dad41e8d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include "../entry/calling.h"
 
 #ifd

[Xen-devel] [PATCH v5 6/9] x86/msr: Carry on after a non-"safe" MSR access fails

2016-04-02 Thread Andy Lutomirski
This demotes an OOPS and likely panic due to a failed non-"safe" MSR
access to a WARN_ONCE and, for RDMSR, a return value of zero.

To be clear, this type of failure should *not* happen.  This patch
exists to minimize the chance of nasty undebuggable failures
happening when a CONFIG_PARAVIRT=y bug in the non-"safe" MSR helpers
gets fixed.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/msr.h | 10 --
 arch/x86/mm/extable.c  | 27 +++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 93fb7c1cffda..1487054a1a70 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -92,7 +92,10 @@ static inline unsigned long long native_read_msr(unsigned 
int msr)
 {
DECLARE_ARGS(val, low, high);
 
-   asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+   asm volatile("1: rdmsr\n"
+"2:\n"
+_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
+: EAX_EDX_RET(val, low, high) : "c" (msr));
if (msr_tracepoint_active(__tracepoint_read_msr))
do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
@@ -119,7 +122,10 @@ static inline unsigned long long 
native_read_msr_safe(unsigned int msr,
 static inline void native_write_msr(unsigned int msr,
unsigned low, unsigned high)
 {
-   asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+   asm volatile("1: wrmsr\n"
+"2:\n"
+_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
+: : "c" (msr), "a"(low), "d" (high) : "memory");
if (msr_tracepoint_active(__tracepoint_read_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
 }
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 50dfe438bd91..98b5f45d9d79 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -50,6 +50,33 @@ bool ex_handler_ext(const struct exception_table_entry 
*fixup,
 }
 EXPORT_SYMBOL(ex_handler_ext);
 
+bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
+struct pt_regs *regs, int trapnr)
+{
+   WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n",
+ (unsigned int)regs->cx);
+
+   /* Pretend that the read succeeded and returned 0. */
+   regs->ip = ex_fixup_addr(fixup);
+   regs->ax = 0;
+   regs->dx = 0;
+   return true;
+}
+EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
+
+bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
+struct pt_regs *regs, int trapnr)
+{
+   WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 
0x%08x%08x)\n",
+ (unsigned int)regs->cx,
+ (unsigned int)regs->dx, (unsigned int)regs->ax);
+
+   /* Pretend that the write succeeded. */
+   regs->ip = ex_fixup_addr(fixup);
+   return true;
+}
+EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
+
 bool ex_has_fault_handler(unsigned long ip)
 {
const struct exception_table_entry *e;
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 0/9] Improve non-"safe" MSR access failure handling

2016-04-02 Thread Andy Lutomirski
There are two parts here:

* FIRST PART: EARLY EXCEPTIONS *

The first few patches move some early panic code into C, add pt_regs
to early exception handling, and make fancy exception handlers work early.

* SECOND PART: MSRs *

Setting CONFIG_PARAVIRT=y has an unintended side effect: it silently
turns all rdmsr and wrmsr operations into the safe variants without
any checks that the operations actually succeed.

With CONFIG_PARAVIRT=n, unchecked MSR failures OOPS and probably
cause boot to fail if they happen before init starts.

Neither behavior is very good, and it's particularly unfortunate that
the behavior changes depending on CONFIG_PARAVIRT.

In particular, KVM guests might be unwittingly depending on the
PARAVIRT=y behavior because CONFIG_KVM_GUEST currently depends on
CONFIG_PARAVIRT, and, because accesses in that case are completely
unchecked, we wouldn't even see a warning.

This series changes the native behavior, regardless of
CONFIG_PARAVIRT.  A non-"safe" MSR failure will give an informative
warning once and will be fixed up -- native_read_msr will return
zero, and both reads and writes will continue where they left off.

If panic_on_oops is set, they will still OOPS and panic.

By using the shiny new custom exception handler infrastructure,
there should be no overhead on the success paths.

I didn't change the behavior on Xen, but, with this series applied,
it would be straightforward for the Xen maintainers to make the
corresponding change -- knowledge of whether the access is "safe" is
now propagated into the pvops.

Doing this is probably a prerequisite to sanely decoupling
CONFIG_KVM_GUEST and CONFIG_PARAVIRT, which would probably make
Arjan and the rest of the Clear Containers people happy :)

There's also room to reduce the code size of the "safe" variants
using custom exception handlers in the future.

Changes from v4:
 - Fix a missing \n (Joe Perches)
 - No more panic_on_oops
 - Works early

Changes from v3:
 - WARN_ONCE instead of WARN (Ingo)
 - In the warning text, s/unsafe/unchecked/ (Ingo, sort of)

Changes from earlier versions: lots of changes!

Andy Lutomirski (9):
  x86/head: Pass a real pt_regs and trapnr to early_fixup_exception
  x86/head: Move the early NMI fixup into C
  x86/head: Move early exception panic code into early_fixup_exception
  x86/traps: Enable all exception handler callbacks early
  x86/paravirt: Add _safe to the read_msr and write_msr PV hooks
  x86/msr: Carry on after a non-"safe" MSR access fails
  x86/paravirt: Add paravirt_{read,write}_msr
  x86/paravirt: Make "unsafe" MSR accesses unsafe even if PARAVIRT=y
  x86/msr: Set the return value to zero when native_rdmsr_safe fails

 arch/x86/include/asm/msr.h|  20 --
 arch/x86/include/asm/paravirt.h   |  45 +++--
 arch/x86/include/asm/paravirt_types.h |  14 ++--
 arch/x86/include/asm/uaccess.h|   2 +-
 arch/x86/kernel/head_32.S | 116 ++
 arch/x86/kernel/head_64.S |  95 
 arch/x86/kernel/paravirt.c|   6 +-
 arch/x86/mm/extable.c |  64 +++
 arch/x86/xen/enlighten.c  |  27 +++-
 9 files changed, 208 insertions(+), 181 deletions(-)

-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 7/9] x86/paravirt: Add paravirt_{read, write}_msr

2016-04-02 Thread Andy Lutomirski
This adds paravirt hooks for unsafe MSR access.  On native, they
call native_{read,write}_msr.  On Xen, they use
xen_{read,write}_msr_safe.

Nothing uses them yet for ease of bisection.  The next patch will
use them in rdmsrl, wrmsrl, etc.

I intentionally didn't make them warn on #GP on Xen.  I think that
should be done separately by the Xen maintainers.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/msr.h|  5 +++--
 arch/x86/include/asm/paravirt.h   | 11 +++
 arch/x86/include/asm/paravirt_types.h | 10 --
 arch/x86/kernel/paravirt.c|  2 ++
 arch/x86/xen/enlighten.c  | 23 +++
 5 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 1487054a1a70..13da359881d7 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -119,8 +119,9 @@ static inline unsigned long long 
native_read_msr_safe(unsigned int msr,
return EAX_EDX_VAL(val, low, high);
 }
 
-static inline void native_write_msr(unsigned int msr,
-   unsigned low, unsigned high)
+/* Can be uninlined because referenced by paravirt */
+notrace static inline void native_write_msr(unsigned int msr,
+   unsigned low, unsigned high)
 {
asm volatile("1: wrmsr\n"
 "2:\n"
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2e49228ed9a3..68297d87e85c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -129,6 +129,17 @@ static inline void wbinvd(void)
 
 #define get_kernel_rpl()  (pv_info.kernel_rpl)
 
+static inline u64 paravirt_read_msr(unsigned msr)
+{
+   return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr);
+}
+
+static inline void paravirt_write_msr(unsigned msr,
+ unsigned low, unsigned high)
+{
+   return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
+}
+
 static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
 {
return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err);
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 5a06cccd36f0..b85aec45a1b8 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -155,8 +155,14 @@ struct pv_cpu_ops {
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
  unsigned int *ecx, unsigned int *edx);
 
-   /* MSR operations.
-  err = 0/-EIO.  wrmsr returns 0/-EIO. */
+   /* Unsafe MSR operations.  These will warn or panic on failure. */
+   u64 (*read_msr)(unsigned int msr);
+   void (*write_msr)(unsigned int msr, unsigned low, unsigned high);
+
+   /*
+* Safe MSR operations.
+* read sets err to 0 or -EIO.  write returns 0 or -EIO.
+*/
u64 (*read_msr_safe)(unsigned int msr, int *err);
int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high);
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8aad95478ae5..f9583917c7c4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -339,6 +339,8 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.write_cr8 = native_write_cr8,
 #endif
.wbinvd = native_wbinvd,
+   .read_msr = native_read_msr,
+   .write_msr = native_write_msr,
.read_msr_safe = native_read_msr_safe,
.write_msr_safe = native_write_msr_safe,
.read_pmc = native_read_pmc,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ff2df20d0067..548cd3e02b9e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1092,6 +1092,26 @@ static int xen_write_msr_safe(unsigned int msr, unsigned 
low, unsigned high)
return ret;
 }
 
+static u64 xen_read_msr(unsigned int msr)
+{
+   /*
+* This will silently swallow a #GP from RDMSR.  It may be worth
+* changing that.
+*/
+   int err;
+
+   return xen_read_msr_safe(msr, &err);
+}
+
+static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
+{
+   /*
+* This will silently swallow a #GP from WRMSR.  It may be worth
+* changing that.
+*/
+   xen_write_msr_safe(msr, low, high);
+}
+
 void xen_setup_shared_info(void)
 {
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1222,6 +1242,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 
.wbinvd = native_wbinvd,
 
+   .read_msr = xen_read_msr,
+   .write_msr = xen_write_msr,
+
.read_msr_safe = xen_read_msr_safe,
.write_msr_safe = xen_write_msr_safe,
 
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 0/9] Improve non-"safe" MSR access failure handling

2016-04-02 Thread Andy Lutomirski
On Sat, Apr 2, 2016 at 7:24 AM, Linus Torvalds
 wrote:
> This patch series looks much nicer than the last one. I assume you
> tested that the early-trap handling actually worked too? I only looked
> at the patches..
>
> Ack to it all,

I injected some BUGs in various places on 32-bit and 64-bit and it
seemed to do the right thing.  Depending on how early they were, I
either got a clean hang or I got a printout, and whether it displayed
anything didn't seem to change with and without the patches.  I think
that early_printk doesn't work from the very very beginning.

I also tried a bad wrmsrl at a couple early points.  Very very early
it just works with no warning.  A little later and it prints the
warning.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Getting rid of inside_vm in intel8x0

2016-04-02 Thread Andy Lutomirski
On Apr 2, 2016 12:07 PM, "Takashi Iwai"  wrote:
>
> On Sat, 02 Apr 2016 14:57:44 +0200,
> Andy Lutomirski wrote:
> >
> > On Fri, Apr 1, 2016 at 10:33 PM, Takashi Iwai  wrote:
> > > On Sat, 02 Apr 2016 00:28:31 +0200,
> > > Luis R. Rodriguez wrote:
> > >> If the former, could a we somehow detect an emulated device other than 
> > >> through
> > >> this type of check ? Or could we *add* a capability of some sort to 
> > >> detect it
> > >> on the driver ? This would not address the removal, but it could mean 
> > >> finding a
> > >> way to address emulation issues.
> > >>
> > >> If its an IO issue -- exactly what is the causing the delays in IO ?
> > >
> > > Luis, there is no problem about emulation itself.  It's rather an
> > > optimization to lighten the host side load, as I/O access on a VM is
> > > much heavier.
> > >
> > >> > > > This is satisfied mostly only on VM, and can't
> > >> > > > be measured easily unlike the IO read speed.
> > >> > >
> > >> > > Interesting, note the original patch claimed it was for KVM and
> > >> > > Parallels hypervisor only, but since the code uses:
> > >> > >
> > >> > > +#if defined(__i386__) || defined(__x86_64__)
> > >> > > +   inside_vm = inside_vm || 
> > >> > > boot_cpu_has(X86_FEATURE_HYPERVISOR);
> > >> > > +#endif
> > >> > >
> > >> > > This makes it apply also to Xen as well, this makes this hack more
> > >> > > broad, but does is it only applicable when an emulated device is
> > >> > > used ? What about if a hypervisor is used and PCI passthrough is
> > >> > > used ?
> > >> >
> > >> > A good question.  Xen was added there at the time from positive
> > >> > results by quick tests, but it might show an issue if it's running on
> > >> > a very old chip with PCI passthrough.  But I'm not sure whether PCI
> > >> > passthrough would work on such old chipsets at all.
> > >>
> > >> If it did have an issue then that would have to be special cased, that
> > >> is the module parameter would not need to be enabled for such type of
> > >> systems, and heuristics would be needed. As you note, fortunately this
> > >> may not be common though...
> > >
> > > Actually this *is* module parametered.  If set to a boolean value, it
> > > can be applied / skipped forcibly.  So, if there has been a problem on
> > > Xen, this should have been reported.  That's why I wrote it's no
> > > common case.  This comes from the real experience.
> > >
> > >> but if this type of work around may be
> > >> taken as a precedent to enable other types of hacks in other drivers
> > >> I'm very fearful of more hacks later needing these considerations as
> > >> well.
> > >>
> > >> > > > > There are a pile of nonsensical "are we in a VM" checks of 
> > >> > > > > various
> > >> > > > > sorts scattered throughout the kernel, they're all a mess to 
> > >> > > > > maintain
> > >> > > > > (there are lots of kinds of VMs in the world, and Linux may not 
> > >> > > > > even
> > >> > > > > know it's a guest), and, in most cases, it appears that the 
> > >> > > > > correct
> > >> > > > > solution is to delete the checks.  I just removed a nasty one in 
> > >> > > > > the
> > >> > > > > x86_32 entry asm, and this one is written in C so it should be a 
> > >> > > > > piece
> > >> > > > > of cake :)
> > >> > > >
> > >> > > > This cake looks sweet, but a worm is hidden behind the cream.
> > >> > > > The loop in the code itself is already a kludge for the buggy 
> > >> > > > hardware
> > >> > > > where the inconsistent read happens not so often (only at the 
> > >> > > > boundary
> > >> > > > and in a racy way).  It would be nice if we can have a more 
> > >> > > > reliably
> > >> > > > way to know the hardware buggyness, but it's difficult,
>

Re: [Xen-devel] [PATCH v5 3/9] x86/head: Move early exception panic code into early_fixup_exception

2016-04-02 Thread Andy Lutomirski
On Sat, Apr 2, 2016 at 11:39 AM, Borislav Petkov  wrote:
> On Sat, Apr 02, 2016 at 07:01:34AM -0700, Andy Lutomirski wrote:
>> This removes a bunch of assembly and adds some C code instead.  It
>> changes the actual printouts on both 32-bit and 64-bit kernels, but
>> they still seem okay.
>>
>> Signed-off-by: Andy Lutomirski 
>> ---
>>  arch/x86/include/asm/uaccess.h |  2 +-
>>  arch/x86/kernel/head_32.S  | 49 
>> +-
>>  arch/x86/kernel/head_64.S  | 45 ++
>>  arch/x86/mm/extable.c  | 29 -
>>  4 files changed, 32 insertions(+), 93 deletions(-)
>
> ...
>
>> @@ -99,21 +101,38 @@ int __init early_fixup_exception(struct pt_regs *regs, 
>> int trapnr)
>>
>>   /* Ignore early NMIs. */
>>   if (trapnr == X86_TRAP_NMI)
>> - return 1;
>> + return;
>> +
>> + if (early_recursion_flag > 2)
>> + goto halt_loop;
>> +
>> + if (regs->cs != __KERNEL_CS)
>> + goto fail;
>>
>>   e = search_exception_tables(regs->ip);
>>   if (!e)
>> - return 0;
>> + goto fail;
>>
>>   new_ip  = ex_fixup_addr(e);
>>   handler = ex_fixup_handler(e);
>>
>>   /* special handling not supported during early boot */
>>   if (handler != ex_handler_default)
>> - return 0;
>> + goto fail;
>>
>>   regs->ip = new_ip;
>> - return 1;
>> + return;
>> +
>> +fail:
>> + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 
>> 0x%lx\n",
>> +  (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
>> +  regs->orig_ax, read_cr2());
>> +
>> + show_regs(regs);
>
> To make this even better, it could be something called early_show_regs()
> or so and be a simplified version of __show_regs() on both bitness but
> which calls early_printk().
>
> This way you'll be able to get out stuff to the console as early as
> possible.
>
> Btw, you don't need to dump rIP, CR2, etc in the PANIC message above
> since you're going to early_show_regs() anyway.

Given that this isn't really a regression with my patches (it
probably never worked much better on 32-bit and the regs never would
have shown at all on 64-bit), I propose a different approach: make
printk work earlier.  Something like:

if (early) {
early_printk(args);
}

or early_vprintk or whatever.

If the cost of a branch mattered, this could be alternative-patched
out later on, but that seems silly.  I also bet that a more sensible
fallback could be created in which printk would try to use an early
console if there's no real console.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 4/9] x86/traps: Enable all exception handler callbacks early

2016-04-02 Thread Andy Lutomirski
On Sat, Apr 2, 2016 at 11:52 AM, Borislav Petkov  wrote:
> On Sat, Apr 02, 2016 at 07:01:35AM -0700, Andy Lutomirski wrote:
>> Now that early_fixup_exception has pt_regs, we can just call
>> fixup_exception from it.  This will make fancy exception handlers
>> work early.
>>
>> Signed-off-by: Andy Lutomirski 
>> ---
>>  arch/x86/mm/extable.c | 19 ++-
>>  1 file changed, 2 insertions(+), 17 deletions(-)
>>
>> diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
>> index 8997022abebc..50dfe438bd91 100644
>> --- a/arch/x86/mm/extable.c
>> +++ b/arch/x86/mm/extable.c
>> @@ -95,10 +95,6 @@ extern unsigned int early_recursion_flag;
>>  /* Restricted version used during very early boot */
>>  void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
>>  {
>> - const struct exception_table_entry *e;
>> - unsigned long new_ip;
>> - ex_handler_t handler;
>> -
>>   /* Ignore early NMIs. */
>>   if (trapnr == X86_TRAP_NMI)
>>   return;
>> @@ -109,19 +105,8 @@ void __init early_fixup_exception(struct pt_regs *regs, 
>> int trapnr)
>>   if (regs->cs != __KERNEL_CS)
>>   goto fail;
>>
>> - e = search_exception_tables(regs->ip);
>> - if (!e)
>> - goto fail;
>> -
>> - new_ip  = ex_fixup_addr(e);
>> - handler = ex_fixup_handler(e);
>> -
>> - /* special handling not supported during early boot */
>> - if (handler != ex_handler_default)
>> - goto fail;
>
> Hold on, what happened to the uaccess handling not being supported
> during early boot?
>
> So before Tony changed it, the original code had:
>
>
>  /* Restricted version used during very early boot */
>  int __init early_fixup_exception(unsigned long *ip)
>  {
>
> ...
> -   if (fixup->fixup - fixup->insn >= 0x7ff0 - 4) {
> -   /* uaccess handling not supported during early boot */
> -   return 0;
> -   }
>
> I'm guessing that wasn't supported early, probably because some stuff
> wasn't initialized yet. Our normal, late fixup is by doing:
>
> current_thread_info()->uaccess_err = 1;
>
> and I'm assuming we can't do that early. current_thread_info is probably
> not setup yet...
>
>> -
>> - regs->ip = new_ip;
>> - return;
>> + if (fixup_exception(regs, trapnr))
>> + return;
>
> So why can we do it now, all of a sudden?

I have no idea why it was explicitly unsupported, but I'm guessing it
was just to avoid duplicating the code.  Early "ext" uaccess failures
are certainly not going to work, but I don't think this is a problem
-- there's no userspace before trap_init runs, so how exactly is an
"ext" uaccess going to happen in the first place?

In any event, if it did happen in older kernels, it would have
immediately panicked due to that code.  At least with my code it just
might manage to EFAULT correctly.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 3/9] x86/head: Move early exception panic code into early_fixup_exception

2016-04-02 Thread Andy Lutomirski
[cc Jan Kara]

On Sat, Apr 2, 2016 at 1:47 PM, Borislav Petkov  wrote:
> On Sat, Apr 02, 2016 at 01:13:37PM -0700, Andy Lutomirski wrote:
>> Given that I this isn't really a regression with my patches (it
>> probably never worked much better on 32-bit and the regs never would
>> have shown at all on 64-bit),
>
> You're right. That thing calls printk *and* early_printk, WTF:
>
> #ifdef CONFIG_EARLY_PRINTK
>
> call early_printk
> ...
>
> call dump_stack
>
> ...
>
> call __print_symbol
>
> those last two call printk. Great.
>
>> I propose a different approach: make
>> printk work earlier.  Something like:
>>
>> if (early) {
>> early_printk(args);
>> }
>>
>> or early_vprintk or whatever.
>>
>> If the cost of a branch mattered, this could be alternative-patched
>> out later on, but that seems silly.  I also bet that a more sensible
>> fallback could be created in which printk would try to use an early
>> console if there's no real console.
>
> So how about this:
>
> printk() does
>
> vprintk_func = this_cpu_read(printk_func);
>
> and that's
>
> DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default
>
> I guess we can make that function be early_printk-something and once
> printk is initialized, we overwrite it with vprintk_default.
>
> Elegant and no need for if branches and alternatives.
>
> Hmmm.

Jan, IIRC you were looking at printk recently-ish.  Any thoughts here?

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 4/9] x86/traps: Enable all exception handler callbacks early

2016-04-03 Thread Andy Lutomirski
On Sun, Apr 3, 2016 at 1:07 AM, Borislav Petkov  wrote:
> On Sat, Apr 02, 2016 at 10:52:48PM +0200, Borislav Petkov wrote:
>> On Sat, Apr 02, 2016 at 01:16:07PM -0700, Andy Lutomirski wrote:
>> > I have no idea why it was explicitly unsupported, but I'm guessing it
>> > was just to avoid duplicating the code.  Early "ext" uaccess failures
>> > are certainly not going to work, but I don't think this is a problem
>> > -- there's no userspace before trap_init runs, so how exactly is an
>> > "ext" uaccess going to happen in the first place?
>> >
>> > In any event, if it did happen in older kernels, it would have
>> > immediately panicked due to that code.  At least with my code it just
>> > might manage to EFAULT correctly.
>>
>> Yeah, I was wondering what that early thing meant.
>>
>> Linus or tip guys probably remember what this whole deal with early
>> uaccess was about. I'll try to do some git archeology tomorrow.
>
> Yep, just as I suspected:
>
> 6a1ea279c210 ("x86, extable: Add early_fixup_exception()")
>
> Apparently, thread_info might not have been setup yet. I'm guessing the
> intention behind this no-uaccess-fixup-early is to not even attempt any
> fixup due to stuff *probably* not initialized yet and so the safer thing
> would be to panic instead.
>
> I'm wondering whether making it try to EFAULT correctly is the right
> thing to do... We're certainly more conservative if we panic and not
> allow some silently failed attempt at recovery which looks successful,
> to continue.
>
> hpa, thoughts?

I don't think this matters much.  There aren't many users of this
mechanism in the tree:

./arch/x86/kernel/signal.c:get_user_try {
./arch/x86/kernel/signal.c:put_user_try {
./arch/x86/kernel/signal.c:put_user_try {
./arch/x86/kernel/signal.c:put_user_try {
./arch/x86/kernel/signal.c:put_user_try {
./arch/x86/kernel/signal_compat.c:put_user_try {
./arch/x86/kernel/signal_compat.c:get_user_try {
./arch/x86/kernel/vm86_32.c:put_user_try {
./arch/x86/kernel/vm86_32.c:get_user_try {
./arch/x86/ia32/ia32_signal.c:get_user_try {
./arch/x86/ia32/ia32_signal.c:put_user_try {
./arch/x86/ia32/ia32_signal.c:put_user_try {
./arch/x86/ia32/ia32_signal.c:put_user_try {
./arch/x86/include/asm/uaccess.h: * {get|put}_user_try and catch
./arch/x86/include/asm/uaccess.h: * get_user_try {
./arch/x86/include/asm/uaccess.h:#define get_user_try    uaccess_try
./arch/x86/include/asm/uaccess.h:#define put_user_try    uaccess_try

I don't see how we could get to that code in the first place without
current_thread_info() working.

If we can ever convince gcc to do jump labels properly for uaccess, it
would probably be better to just delete all that code.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 5/9] x86/paravirt: Add _safe to the read_msr and write_msr PV hooks

2016-04-03 Thread Andy Lutomirski
On Sun, Apr 3, 2016 at 1:41 AM, Borislav Petkov  wrote:
> On Sat, Apr 02, 2016 at 07:01:36AM -0700, Andy Lutomirski wrote:
>> These hooks match the _safe variants, so name them accordingly.
>> This will make room for unsafe PV hooks.
>>
>> Signed-off-by: Andy Lutomirski 
>> ---
>>  arch/x86/include/asm/paravirt.h   | 33 +
>>  arch/x86/include/asm/paravirt_types.h |  8 
>>  arch/x86/kernel/paravirt.c|  4 ++--
>>  arch/x86/xen/enlighten.c  |  4 ++--
>>  4 files changed, 25 insertions(+), 24 deletions(-)
>
> ...
>
>> diff --git a/arch/x86/include/asm/paravirt_types.h 
>> b/arch/x86/include/asm/paravirt_types.h
>> index 77db5616a473..5a06cccd36f0 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -155,10 +155,10 @@ struct pv_cpu_ops {
>>   void (*cpuid)(unsigned int *eax, unsigned int *ebx,
>> unsigned int *ecx, unsigned int *edx);
>>
>> - /* MSR, PMC and TSR operations.
>> -err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
>> - u64 (*read_msr)(unsigned int msr, int *err);
>> - int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
>> + /* MSR operations.
>> +err = 0/-EIO.  wrmsr returns 0/-EIO. */
>
> Please reformat this comment properly, while you're at it:
>
> /*
>  * A sentence.
>  * Another sentence.
>  */

You already caught that one.  It's fixed in "x86/paravirt: Add
paravirt_{read,write}_msr".

Congrats on being deterministic :)

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 4/9] x86/traps: Enable all exception handler callbacks early

2016-04-03 Thread Andy Lutomirski
On Sun, Apr 3, 2016 at 6:51 AM, Linus Torvalds
 wrote:
> On Sun, Apr 3, 2016 at 3:07 AM, Borislav Petkov  wrote:
>>
>> I'm wondering whether making it try to EFAULT correctly is the right
>> thing to do... We're certainly more conservative if we panic and not
>> allow some silently failed attempt at recovery which looks successful,
>> to continue.
>
> No, please don't fail at early boot.
>
> Early boot is just about the *worst* situation to try to debug odd
> failures, exactly since things like printk may not be reliable, and
> things won't get logged etc.
>
> So particularly during early boot we should try as hard as possible
> not to crash - even if it means not being able to log about a problem.
> At least that way you have a hopefully working machine and can *maybe*
> debug things.
>

In this regard, at least, my patch is the right approach.  Calling the
handler, whatever it is, is less likely to panic than refusing to call
it.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 3/9] x86/head: Move early exception panic code into early_fixup_exception

2016-04-04 Thread Andy Lutomirski
On Apr 4, 2016 4:51 AM, "Jan Kara"  wrote:
>
> On Sat 02-04-16 13:58:19, Andy Lutomirski wrote:
> > [cc Jan Kara]
> >
> > On Sat, Apr 2, 2016 at 1:47 PM, Borislav Petkov  wrote:
> > > On Sat, Apr 02, 2016 at 01:13:37PM -0700, Andy Lutomirski wrote:
> > >> Given that I this isn't really a regression with my patches (it
> > >> probably never worked much better on 32-bit and the regs never would
> > >> have shown at all on 64-bit),
> > >
> > > You're right. That thing calls printk *and* early_printk, WTF:
> > >
> > > #ifdef CONFIG_EARLY_PRINTK
> > >
> > > call early_printk
> > > ...
> > >
> > > call dump_stack
> > >
> > > ...
> > >
> > > call __print_symbol
> > >
> > > those last two call printk. Great.
> > >
> > >> I propose a different approach: make
> > >> printk work earlier.  Something like:
> > >>
> > >> if (early) {
> > >> early_printk(args);
> > >> }
> > >>
> > >> or early_vprintk or whatever.
> > >>
> > >> If the cost of a branch mattered, this could be alternative-patched
> > >> out later on, but that seems silly.  I also bet that a more sensible
> > >> fallback could be created in which printk would try to use an early
> > >> console if there's no real console.
> > >
> > > So how about this:
> > >
> > > printk() does
> > >
> > > vprintk_func = this_cpu_read(printk_func);
> > >
> > > and that's
> > >
> > > DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default
> > >
> > > I guess we can make that function be early_printk-something and once
> > > printk is initialized, we overwrite it with vprintk_default.
> > >
> > > Elegant and no need for if branches and alternatives.
> > >
> > > Hmmm.
> >
> > Jan, IIRC you were looking at printk recently-ish.  Any thoughts here?
>
> Sounds like a good idea to me. I've also consulted this with Petr Mladek
> (added to CC) who is using printk_func per-cpu variable in his
> printk-from-NMI patches and he also doesn't see a problem with this.
>
> I was just wondering about one thing - this way we add more early printks
> if I understand your intention right. Are we guaranteed that they happen
> only from a single CPU? Because currently there is no locking in
> early_printk() and thus we can end up writing to early console several
> messages in parallel from different CPUs. Not sure what's going to happen
> in that case...

Adding locking would be easy enough, wouldn't it?

But do any platforms really boot a second CPU before switching to real
printk?  Given that I see all the smpboot stuff in dmesg, I guess real
printk happens first.  I admit I haven't actually checked.

--Andy

>
> Honza
> --
> Jan Kara 
> SUSE Labs, CR

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH] x86/extable: Add a comment about early exception handlers

2016-04-04 Thread Andy Lutomirski
Borislav asked for a comment explaining why all exception handlers are
allowed early.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/mm/extable.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 98b5f45d9d79..36fe03bc81ee 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -132,6 +132,20 @@ void __init early_fixup_exception(struct pt_regs *regs, 
int trapnr)
if (regs->cs != __KERNEL_CS)
goto fail;
 
+   /*
+* The full exception fixup machinery is available as soon as
+* the early IDT is loaded.  This means that it is the
+* responsibility of extable users to either function correctly
+* when handlers are invoked early or to simply avoid causing
+* exceptions before they're ready to handle them.
+*
+* This is better than filtering which handlers can be used,
+* because refusing to call a handler here is guaranteed to
+* result in a hard-to-debug panic.
+*
+* Keep in mind that not all vectors actually get here.  Early
+* page faults, for example, are special.
+*/
if (fixup_exception(regs, trapnr))
return;
 
-- 
2.5.5


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 4/9] x86/traps: Enable all exception handler callbacks early

2016-04-04 Thread Andy Lutomirski
On Sun, Apr 3, 2016 at 7:10 AM, Borislav Petkov  wrote:
> On Sun, Apr 03, 2016 at 06:55:00AM -0700, Andy Lutomirski wrote:
>> > No, please don't fail at early boot.
>> >
>> > Early boot is just about the *worst* situation to try to debug odd
>> > failures, exactly since things like printk may not be reliable, and
>> > things won't get logged etc.
>> >
>> > So particularly during early boot we should try as hard as possible
>> > not to crash - even if it means not being able to log about a problem.
>> > At least that way you have a hopefully working machine and can *maybe*
>> > debug things.
>> >
>>
>> In this regard, at least, my patch is the right approach.  Calling the
>> handler, whatever it is, is less likely to panic than refusing to call
>> it.
>
> Ok, good.
>
> But can we pretty please document this whole situation, i.e., the
> fact that we're trying really really hard not to fail early boot for
> debuggability reasons - either in the commit message or better in the
> code, for future reference. I think this is an important aspect to hold
> down.

I emailed out a followup patch to add a comment.

--Andy

>
> Thanks.
>
> --
> Regards/Gruss,
> Boris.
>
> ECO tip #101: Trim your mails when you reply.



-- 
Andy Lutomirski
AMA Capital Management, LLC

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 7/9] x86/paravirt: Add paravirt_{read, write}_msr

2016-04-04 Thread Andy Lutomirski
On Mon, Apr 4, 2016 at 9:33 AM, David Vrabel  wrote:
> On 02/04/16 15:01, Andy Lutomirski wrote:
>> This adds paravirt hooks for unsafe MSR access.  On native, they
>> call native_{read,write}_msr.  On Xen, they use
>> xen_{read,write}_msr_safe.
>>
>> Nothing uses them yet for ease of bisection.  The next patch will
>> use them in rdmsrl, wrmsrl, etc.
>>
>> I intentionally didn't make them warn on #GP on Xen.  I think that
>> should be done separately by the Xen maintainers.
> ...
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>
> Reviewed-by: David Vrabel 
>
>> @@ -1092,6 +1092,26 @@ static int xen_write_msr_safe(unsigned int msr, 
>> unsigned low, unsigned high)
>>   return ret;
>>  }
>>
>> +static u64 xen_read_msr(unsigned int msr)
>> +{
>> + /*
>> +  * This will silently swallow a #GP from RDMSR.  It may be worth
>> +  * changing that.
>
> This probably isn't important.
>

It might be nice to do a WARN_ON_ONCE like this series does for the
native case to help shake out latent bugs.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 3/9] x86/head: Move early exception panic code into early_fixup_exception

2016-04-04 Thread Andy Lutomirski
On Mon, Apr 4, 2016 at 12:38 PM, Borislav Petkov  wrote:
> On Mon, Apr 04, 2016 at 06:00:42PM +0200, Peter Zijlstra wrote:
>> On Mon, Apr 04, 2016 at 08:32:21AM -0700, Andy Lutomirski wrote:
>>
>> > Adding locking would be easy enough, wouldn't it?
>>
>> See patch in this thread..
>>
>> > But do any platforms really boot a second CPU before switching to real
>> > printk?
>>
>> I _only_ use early_printk() as printk() is a quagmire of fail :-)
>
> And since I'm the king of minimalistic changes... this below
> works. However, the problem is that we need to pull up all that
> early_param parsing in order to enable the early console, i.e.
> "earlyprintk=ttyS0,115200" cmdline parsing.
>
> And we can do all that after having setup the IDT. So I'd need to do
> some early dancing with cmdline_find_option_bool(boot_command_line, ...
> in asm or so. Need to think about it more.
>

I'd be nervous about moving early param parsing, especially as part of
the same patch.

Could you do it by moving just the earlyprintk stuff a la
fpu__init_parse_early_param()?

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Adding Xen to the kbuild bot?

2016-04-04 Thread Andy Lutomirski
On Wed, Feb 17, 2016 at 12:46 PM, Andy Lutomirski  wrote:
> On Fri, Feb 5, 2016 at 5:17 PM, Fengguang Wu  wrote:
>> On Fri, Feb 05, 2016 at 12:10:56PM -0800, Andy Lutomirski wrote:
>>> On Feb 4, 2016 7:11 PM, "Fengguang Wu"  wrote:
>>> >
>>> > Hi Andy,
>>> >
>>> > CC more people on Xen testing -- in case OSStest already (or plans to)
>>> > cover such test case.
>>> >
>>> > On Tue, Feb 02, 2016 at 07:31:30PM -0800, Andy Lutomirski wrote:
>>> > > Hi all-
>>> > >
>>> > > Would it make sense to add some basic Xen PV testing to the kbuild bot?
>>> >
>>> > Do you mean to run basic Xen testing on the various kernel trees that
>>> > 0day robot covers? That is, to catch kernel regressions when running
>>> > under Xen.
>>> >
>>>
>>> Yes, exactly.  I've personally broken Linux as a Xen guest at least twice.
>>>
>>> > If the intention is to catch Xen regressions, the OSStest
>>> > infrastructure may be a better option:
>>> >
>>> > git://xenbits.xen.org/osstest.git
>>>
>>> No, I think that 0day should pick one Xen version and stick with it
>>> for a while rather than trying to track the latest version.
>>
>> OK, got it. So it's suitable to run in 0day.
>>
>>> > > qemu can boot Xen like this:
>>> > >
>>> > > qemu-system-x86_64 -kernel path/to/xen-4.5.2 -initrd 'path/to/bzImage
>>> > > kernelarg otherkernelarg=value" -append 'xenarg other_xen_arg'
>>> > >
>>> > > This should work with any kernel image for x86 or x86_64 with 
>>> > > CONFIG_XEN=y.
>>> >
>>> > Got it. If you have simple working test scripts to illustrate test
>>> > details, it'd be a great help for integrating into OSStest or 0day.
>>>
>>> I have a script that will boot to a command prompt, but I don't know
>>> if that's the right way to do it.  I'm not really sure how 0day works
>>> under the hood, but treating Xen as a different configuration or arch
>>> instead of treating it as a different test case might make more sense.
>>
>> We can check the script first, then determine the most suitable way to
>> integrate it into 0day. My guess is, it might be suitable to run as a
>> new kind of VM host, like this
>>
>> https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/tree/hosts/vm-kbuild-1G
>>
>> model: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
>> nr_vm: 12
>> nr_cpu: 2
>> memory: 1G
>> disk_type: virtio-scsi
>> rootfs: debian-x86_64.cgz
>> hdd_partitions: /dev/sda /dev/sdb /dev/sdc /dev/sdd
>> swap_partitions: /dev/sde
>
> This makes sense to me, but I think it would need an extension to the
> configuration language.
>
> The guest virtio code should be in the next -next release.
>

FWIW, 4.6-rc1 works when booted via Xen on QEMU/KVM with virtio out of the box.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] Does __KERNEL_DS serve a purpose?

2016-04-07 Thread Andy Lutomirski
I can't see any reason that we need the __KERNEL_DS segment at all --
I think that everything that uses __KERNEL_DS could use __USER_DS
instead.  Am I missing anything?  This has been bugging me for a
while.

I mulled over this a bit when trying to understand the sysret_ss_attrs
bug and then forgot about it.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Does __KERNEL_DS serve a purpose?

2016-04-08 Thread Andy Lutomirski
On Fri, Apr 8, 2016 at 1:01 AM, Andrew Cooper  wrote:
> On 08/04/2016 01:24, Andy Lutomirski wrote:
>> I can't see any reason that we need the __KERNEL_DS segment at all --
>> I think that everything that uses __KERNEL_DS could use __USER_DS
>> instead.  Am I missing anything?  This has been bugging me for a
>> while.
>>
>> I mulled over this a bit when trying to understand the sysret_ss_attrs
>> bug and then forgot about it.
>
> Linux doesn't have a separate __KERNEL_SS.  For the plain data segments,
> the dpl is not interesting.
>
> However, %ss is also loaded with __KERNEL_DS, and %ss.dpl is somewhat
> important.

But %ss can be loaded with 0 on 64-bit kernels.  (I assume that
loading 0 into %ss sets SS.DPL to 0 if done at CPL0, but I'm vague on
this, since it only really matters to hypervisor code AFAIK.)

32-bit kernels need __KERNEL_DS, I think.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Does __KERNEL_DS serve a purpose?

2016-04-08 Thread Andy Lutomirski
On Fri, Apr 8, 2016 at 10:12 AM, Paolo Bonzini  wrote:
>
>
> On 08/04/2016 18:00, Andy Lutomirski wrote:
>> But %ss can be loaded with 0 on 64-bit kernels.  (I assume that
>> loading 0 into %ss sets SS.DPL to 0 if done at CPL0, but I'm vague on
>> this, since it only really matters to hypervisor code AFAIK.)
>
> It's even simpler, unless CPL=0 SS cannot be loaded with 0 while in a
> 64-bit code segment (SS can never be loaded with 0 if you're not in a
> 64-bit code segment).
>
> Thus indeed SS=0 implies SS.DPL=0 on 64-bit kernels.

I think we are stuck with __KERNEL_DS: SYSCALL uses it.  Unless we
start fiddling with conforming code segments (ugh), I don't think
there's a valid GDT layout that doesn't have two flat data segments.

Oh well, chalk it up to historical accident.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMLite / PVHv2 - using x86 EFI boot entry

2016-04-12 Thread Andy Lutomirski
On Sun, Apr 10, 2016 at 10:12 PM, Juergen Gross  wrote:
> On 08/04/16 22:40, Luis R. Rodriguez wrote:
>> On Wed, Apr 06, 2016 at 10:40:08AM +0100, David Vrabel wrote:
>>> On 06/04/16 03:40, Luis R. Rodriguez wrote:

 * You don't need full EFI emulation
>>>
>>> I think needing any EFI emulation inside Xen (which is where it would
>>> need to be for dom0) is not suitable because of the increase in
>>> hypervisor ABI.
>>
>> Is this because of timing on architecture / design of HVMLite, or
>> a general position that the complexity to deal with EFI emulation
>> is too much for Xen's taste ?
>
> The Xen hypervisor should be as small as possible. Adding an EFI
> emulator will be adding quite some code. This should be done after a
> very thorough evaluation only.
>
>> ARM already went the EFI entry way for domU -- it went the OVMF route,
>> would such a possibility be possible for x86 domU HVMLite ? If not why
>> not, I mean it would seem to make sense to at least mimic the same type
>> of early boot environment, and perhaps there are some lessons to be
>> learned from that effort too.
>
> The final solution must be appropriate for dom0, too. So don't try
> to limit the discussion to domU. If dom0 isn't going to be acceptable
> there will no need to discuss domU.
>
>> Are there some lessons to be learned with ARM's effort? What are they?
>> If that could be re-done again with any type of cleaner path, what
>> could that be that could help the x86 side ?
>>
>> Although emulating EFI may require work, some folks have pointed out
>> that the amount of work may not be that much. If that is done can
>> we instead rely on the same code to replace OVMF to support both
>> Xen ARM and Xen HVMLite on x86 ? What would be the pros / cons of
>> this ?
>>
>>> I also still do not understand your objection to the current tiny stub.
>>
>> Its more of a hypothetical -- can an EFI entry be used instead given
>> it already does exactly what the new small entry does ? Its also rather
>> odd to add a new entry without evaluating fully a possible alternative
>> that would provide the same exact mechanism.
>
> The interface isn't the new entry only. It should be evaluated how much
> of the early EFI boot path would be common to the HVMlite one. What
> would be gained by using the same entry but having two different boot
> paths after it? You still need a way to distinguish between bare metal
> EFI and HVMlite. And Xen needs a way to find out whether a kernel is
> supporting HVMlite to boot it in the correct mode.
>
>> A full technical unbiased evaluation of the different approaches is what I'd
>> hope we could strive to achieve through discussion and peer review, thinking
>> and prioritizing ultimately what is best to minimize the impact on Linux
>> and also help take advantage of the best features possible through both
>> means. Thinking long term, not immediate short term.
>
> Sure.

FWIW, someone just pointed me to u-boot's EFI implementation.
u-boot's lib/efi_loader contains a tiny (<3k LOC, 10kB compiled) UEFI
implementation that's sufficient to boot a Linux EFI payload.

An argument against making Xen's default domU entry use UEFI is that
it might become unnecessarily awkward to do something like
chainloading to OVMF.   But maybe OVMF can be compiled as a UEFI
binary :)

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] xen_exit_mmap() questions

2017-04-26 Thread Andy Lutomirski
I was trying to understand xen_drop_mm_ref() to update it for some
changes I'm working on, and I'm wondering whether we need
xen_exit_mmap() at all.

AFAICS the intent is to force all CPUs to drop their lazy uses of the
mm being destroyed so it can be unpinned before tearing down the page
tables, thus making it faster to tear down the page tables.  This
seems like it'll speed up xen_set_pud() and xen_set_pmd(), but this
seems like it may be of rather limited value.  Could we get away with
deleting it?

Also, this code in drop_other_mm_ref() looks dubious to me:

/* If this cpu still has a stale cr3 reference, then make sure
   it has been flushed. */
if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);

If cr3 hasn't been flushed to the hypervisor because we're in a lazy
mode, why would load_cr3() help?  Shouldn't this be xen_mc_flush()
instead?

Anyway, the whitespace-damaged patch below seems to result in a
fully-functional kernel:

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 37cb5aad71de..e4e073844cbf 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -998,101 +998,6 @@ static void xen_dup_mmap(struct mm_struct
*oldmm, struct mm_struct *mm)
 spin_unlock(&mm->page_table_lock);
 }

-
-#ifdef CONFIG_SMP
-/* Another cpu may still have their %cr3 pointing at the pagetable, so
-   we need to repoint it somewhere else before we can unpin it. */
-static void drop_other_mm_ref(void *info)
-{
-struct mm_struct *mm = info;
-struct mm_struct *active_mm;
-
-active_mm = this_cpu_read(cpu_tlbstate.active_mm);
-
-if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
-leave_mm(smp_processor_id());
-
-/* If this cpu still has a stale cr3 reference, then make sure
-   it has been flushed. */
-if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
-load_cr3(swapper_pg_dir);
-}
-
-static void xen_drop_mm_ref(struct mm_struct *mm)
-{
-cpumask_var_t mask;
-unsigned cpu;
-
-if (current->active_mm == mm) {
-if (current->mm == mm)
-load_cr3(swapper_pg_dir);
-else
-leave_mm(smp_processor_id());
-}
-
-/* Get the "official" set of cpus referring to our pagetable. */
-if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
-for_each_online_cpu(cpu) {
-if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
-&& per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
-continue;
-smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
-}
-return;
-}
-cpumask_copy(mask, mm_cpumask(mm));
-
-/* It's possible that a vcpu may have a stale reference to our
-   cr3, because its in lazy mode, and it hasn't yet flushed
-   its set of pending hypercalls yet.  In this case, we can
-   look at its actual current cr3 value, and force it to flush
-   if needed. */
-for_each_online_cpu(cpu) {
-if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
-cpumask_set_cpu(cpu, mask);
-}
-
-if (!cpumask_empty(mask))
-smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
-free_cpumask_var(mask);
-}
-#else
-static void xen_drop_mm_ref(struct mm_struct *mm)
-{
-if (current->active_mm == mm)
-load_cr3(swapper_pg_dir);
-}
-#endif
-
-/*
- * While a process runs, Xen pins its pagetables, which means that the
- * hypervisor forces it to be read-only, and it controls all updates
- * to it.  This means that all pagetable updates have to go via the
- * hypervisor, which is moderately expensive.
- *
- * Since we're pulling the pagetable down, we switch to use init_mm,
- * unpin old process pagetable and mark it all read-write, which
- * allows further operations on it to be simple memory accesses.
- *
- * The only subtle point is that another CPU may be still using the
- * pagetable because of lazy tlb flushing.  This means we need need to
- * switch all CPUs off this pagetable before we can unpin it.
- */
-static void xen_exit_mmap(struct mm_struct *mm)
-{
-get_cpu();/* make sure we don't move around */
-xen_drop_mm_ref(mm);
-put_cpu();
-
-spin_lock(&mm->page_table_lock);
-
-/* pgd may not be pinned in the error exit path of execve */
-if (xen_page_pinned(mm->pgd))
-xen_pgd_unpin(mm);
-
-spin_unlock(&mm->page_table_lock);
-}
-
 static void xen_post_allocator_init(void);

 static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
@@ -1544,6 +1449,8 @@ static int xen_pgd_alloc(struct mm_struct *mm)

 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
+xen_pgd_unpin(mm);
+
 #ifdef CONFIG_X86_64
 pgd_t *user_pgd = xen_get_user_pgd(pgd);

@@ -2465,7 +2372,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {

 .activate_mm = xen_activate_mm,
 .dup_mmap = xen_dup_mmap,
-.exit_mmap = xen_exit_mmap,

 .lazy_mode = {
 .enter = paravirt_enter_lazy_

Re: [Xen-devel] xen_exit_mmap() questions

2017-04-26 Thread Andy Lutomirski
On Wed, Apr 26, 2017 at 3:45 PM, Boris Ostrovsky
 wrote:
> On 04/26/2017 04:52 PM, Andy Lutomirski wrote:
>> I was trying to understand xen_drop_mm_ref() to update it for some
>> changes I'm working on, and I'm wondering whether we need
>> xen_exit_mmap() at all.
>>
>> AFAICS the intent is to force all CPUs to drop their lazy uses of the
>> mm being destroyed so it can be unpinned before tearing down the page
>> tables, thus making it faster to tear down the page tables.  This
>> seems like it'll speed up xen_set_pud() and xen_set_pmd(), but this
>> seems like it may be of rather limited value.
>
> Why do you think it's of limited value? Without it we will end up with a
> hypercall for each update.
>
> Or is your point that the number of those update is relatively small
> when we are tearing down?

The latter.  Also, unless I'm missing something, xen_set_pte() doesn't
have the optimization.  I haven't looked at exactly how page table
teardown works, but if it clears each PTE individually, then that's
the bulk of the work.

>
>
>>  Could we get away with
>> deleting it?
>>
>> Also, this code in drop_other_mm_ref() looks dubious to me:
>>
>> /* If this cpu still has a stale cr3 reference, then make sure
>>it has been flushed. */
>> if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
>> load_cr3(swapper_pg_dir);
>>
>> If cr3 hasn't been flushed to the hypervisor because we're in a lazy
>> mode, why would load_cr3() help?  Shouldn't this be xen_mc_flush()
>> instead?
>
> load_cr3() actually ends with xen_mc_flush() by way of xen_write_cr3()
> -> xen_mc_issue().

xen_mc_issue() does:

if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();

I assume the load_cr3() is intended to deal with the case where we're
in lazy mode, but we'll still be in lazy mode, right?  Or does it
serve some other purpose?

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] xen_exit_mmap() questions

2017-04-26 Thread Andy Lutomirski
On Wed, Apr 26, 2017 at 5:55 PM, Boris Ostrovsky
 wrote:
>
>
> On 04/26/2017 06:49 PM, Andy Lutomirski wrote:
>>
>> On Wed, Apr 26, 2017 at 3:45 PM, Boris Ostrovsky
>>  wrote:
>>>
>>> On 04/26/2017 04:52 PM, Andy Lutomirski wrote:
>>>>
>>>> I was trying to understand xen_drop_mm_ref() to update it for some
>>>> changes I'm working on, and I'm wondering whether we need
>>>> xen_exit_mmap() at all.
>>>>
>>>> AFAICS the intent is to force all CPUs to drop their lazy uses of the
>>>> mm being destroyed so it can be unpinned before tearing down the page
>>>> tables, thus making it faster to tear down the page tables.  This
>>>> seems like it'll speed up xen_set_pud() and xen_set_pmd(), but this
>>>> seems like it may be of rather limited value.
>>>
>>>
>>> Why do you think it's of limited value? Without it we will end up with a
>>> hypercall for each update.
>>>
>>> Or is your point that the number of those update is relatively small
>>> when we are tearing down?
>>
>>
>> The latter.  Also, unless I'm missing something, xen_set_pte() doesn't
>> have the optimization.  I haven't looked at exactly how page table
>> teardown works, but if it clears each PTE individually, then that's
>> the bulk of the work.
>>
>>>
>>>
>>>>  Could we get away with
>>>> deleting it?
>>>>
>>>> Also, this code in drop_other_mm_ref() looks dubious to me:
>>>>
>>>> /* If this cpu still has a stale cr3 reference, then make sure
>>>>it has been flushed. */
>>>> if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
>>>> load_cr3(swapper_pg_dir);
>>>>
>>>> If cr3 hasn't been flushed to the hypervisor because we're in a lazy
>>>> mode, why would load_cr3() help?  Shouldn't this be xen_mc_flush()
>>>> instead?
>>>
>>>
>>> load_cr3() actually ends with xen_mc_flush() by way of xen_write_cr3()
>>> -> xen_mc_issue().
>>
>>
>> xen_mc_issue() does:
>>
>> if ((paravirt_get_lazy_mode() & mode) == 0)
>> xen_mc_flush();
>>
>> I assume the load_cr3() is intended to deal with the case where we're
>> in lazy mode, but we'll still be in lazy mode, right?  Or does it
>> serve some other purpose?
>
>
> Of course. I can't read (I ignored the "== 0" part).
>
> Apparently the early version had an explicit flush but then it disappeared
> (commit 9f79991d4186089e228274196413572cc000143b).
>
> The point of CR3 loading here, I believe, is to make sure the hypervisor
> knows that the (v)CPU is no longer using the mm's cr3 (we are loading
> swapper_pgdir here).

But that's what leave_mm() does.  To be fair, the x86 lazy TLB
management is a big mess, and this came up because I'm trying to clean
it up without removing it.

I suppose I can try to keep xen_exit_mmap() working.  Is there a
simple way to try to unpin but to not treat it as an error if the
hypervisor rejects it?

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] xen_exit_mmap() questions

2017-04-27 Thread Andy Lutomirski
On Thu, Apr 27, 2017 at 6:21 AM, Boris Ostrovsky
 wrote:
>
>
>> Also, this code in drop_other_mm_ref() looks dubious to me:
>>
>> /* If this cpu still has a stale cr3 reference, then make sure
>>it has been flushed. */
>> if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
>> load_cr3(swapper_pg_dir);
>>
>> If cr3 hasn't been flushed to the hypervisor because we're in a lazy
>> mode, why would load_cr3() help?  Shouldn't this be xen_mc_flush()
>> instead?
>
> load_cr3() actually ends with xen_mc_flush() by way of xen_write_cr3()
> -> xen_mc_issue().

 xen_mc_issue() does:

 if ((paravirt_get_lazy_mode() & mode) == 0)
 xen_mc_flush();

 I assume the load_cr3() is intended to deal with the case where we're
 in lazy mode, but we'll still be in lazy mode, right?  Or does it
 serve some other purpose?
>>>
>>> Of course. I can't read (I ignored the "== 0" part).
>>>
>>> Apparently the early version had an explicit flush but then it disappeared
>>> (commit 9f79991d4186089e228274196413572cc000143b).
>>>
>>> The point of CR3 loading here, I believe, is to make sure the hypervisor
>>> knows that the (v)CPU is no longer using the mm's cr3 (we are loading
>>> swapper_pgdir here).
>> But that's what leave_mm() does.  To be fair, the x86 lazy TLB
>> management is a big mess, and this came up because I'm trying to clean
>> it up without removing it.
>
> True. I don't know though if you can guarantee that leave_mm() (or
> load_cr3() inside it) is actually called if we are in lazy mode.

The code just before that makes these calls.

Anyway, I propose to rewrite the whole thing like this:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/tlbflush_cleanup&id=ff143a54bb3bafaaad6e32145a9cfbc112e8584f

>
>>
>> I suppose I can try to keep xen_exit_mmap() working.  Is there a
>> simple way to try to unpin but to not treat it as an error if the
>> hypervisor rejects it?
>
> Even if we managed to craft a call in Linux to do this (current
> xen_pgd_unpin() will result in a WARNing in xen_mc_flush()) this will
> still cause a bunch of warnings in the hypervisor (if it is built as
> DEBUG, but bad nevertheless).
>
> But even without that, it is an error for a reason so how are you
> planning to continue if you ignore it?
>

I was imagining that we'd just try to unpin and carry on if it fails.
We can always unpin later in xen_pgd_free().

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] xen_exit_mmap() questions

2017-04-27 Thread Andy Lutomirski
On Thu, Apr 27, 2017 at 1:00 PM, Boris Ostrovsky
 wrote:
> On 04/27/2017 12:46 PM, Andy Lutomirski wrote:
>> On Thu, Apr 27, 2017 at 6:21 AM, Boris Ostrovsky
>>  wrote:
>>>>>>>> Also, this code in drop_other_mm_ref() looks dubious to me:
>>>>>>>>
>>>>>>>> /* If this cpu still has a stale cr3 reference, then make sure
>>>>>>>>it has been flushed. */
>>>>>>>> if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
>>>>>>>> load_cr3(swapper_pg_dir);
>>>>>>>>
>>>>>>>> If cr3 hasn't been flushed to the hypervisor because we're in a lazy
>>>>>>>> mode, why would load_cr3() help?  Shouldn't this be xen_mc_flush()
>>>>>>>> instead?
>>>>>>> load_cr3() actually ends with xen_mc_flush() by way of xen_write_cr3()
>>>>>>> -> xen_mc_issue().
>>>>>> xen_mc_issue() does:
>>>>>>
>>>>>> if ((paravirt_get_lazy_mode() & mode) == 0)
>>>>>> xen_mc_flush();
>>>>>>
>>>>>> I assume the load_cr3() is intended to deal with the case where we're
>>>>>> in lazy mode, but we'll still be in lazy mode, right?  Or does it
>>>>>> serve some other purpose?
>>>>> Of course. I can't read (I ignored the "== 0" part).
>>>>>
>>>>> Apparently the early version had an explicit flush but then it disappeared
>>>>> (commit 9f79991d4186089e228274196413572cc000143b).
>>>>>
>>>>> The point of CR3 loading here, I believe, is to make sure the hypervisor
>>>>> knows that the (v)CPU is no longer using the mm's cr3 (we are loading
>>>>> swapper_pgdir here).
>>>> But that's what leave_mm() does.  To be fair, the x86 lazy TLB
>>>> management is a big mess, and this came up because I'm trying to clean
>>>> it up without removing it.
>>> True. I don't know though if you can guarantee that leave_mm() (or
>>> load_cr3() inside it) is actually called if we are in lazy mode.
>> The code just before that makes these calls.
>
> Yes, and I was unsure whether we always get to make these calls, based
> on mm and cpu_tlbstate. I think we do and with your changes it is made
> even more clear.

:)

>
>>
>> Anyway, I propose to rewrite the whole thing like this:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/tlbflush_cleanup&id=ff143a54bb3bafaaad6e32145a9cfbc112e8584f
>
> Can you explain xen_pgd_free() change? When do you expect
> xen_exit_mmap() to fail unpinning (compared to what we have now)?

I don't expect it to fail, but I made fairly extensive changes.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH] x86/xen: Improve failed hypercall debugging

2017-05-02 Thread Andy Lutomirski
When fiddling with xen_exit_mmap(), I noticed that failed multicall
debugging doesn't work if the multicall is just one call.  Fix it.

Cc: Juergen Gross 
Cc: Konrad Rzeszutek Wilk 
Cc: Boris Ostrovsky 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/xen/multicalls.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index ea54a08d8301..b6b3f024d342 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -96,23 +96,23 @@ void xen_mc_flush(void)
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
+   }
 
 #if MC_DEBUG
-   if (ret) {
-   printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
-  ret, smp_processor_id());
-   dump_stack();
-   for (i = 0; i < b->mcidx; i++) {
-   printk(KERN_DEBUG "  call %2d/%d: op=%lu 
arg=[%lx] result=%ld\t%pF\n",
-  i+1, b->mcidx,
-  b->debug[i].op,
-  b->debug[i].args[0],
-  b->entries[i].result,
-  b->caller[i]);
-   }
+   if (ret) {
+   printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+  ret, smp_processor_id());
+   dump_stack();
+   for (i = 0; i < b->mcidx; i++) {
+   printk(KERN_DEBUG "  call %2d/%d: op=%lu arg=[%lx] 
result=%ld\t%pF\n",
+  i+1, b->mcidx,
+  b->debug[i].op,
+  b->debug[i].args[0],
+  b->entries[i].result,
+  b->caller[i]);
}
-#endif
}
+#endif
 
b->mcidx = 0;
b->argidx = 0;
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

2017-06-12 Thread Andy Lutomirski
The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Cc: Tom Lendacky 
Cc: Juergen Gross 
Cc: xen-devel 
Cc: Boris Ostrovsky 
Signed-off-by: Andy Lutomirski 
---

Hi Ingo-

I broke this out because Tom's SME series and my PCID series both need it.
Please consider applying it to tip:x86/mm.

I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Thanks,
Andy

 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h |  2 +-
 arch/x86/include/asm/mmu_context.h |  4 ++--
 arch/x86/include/asm/paravirt.h|  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++
 arch/x86/include/asm/processor.h   |  8 
 arch/x86/include/asm/special_insns.h   | 10 +++---
 arch/x86/include/asm/tlbflush.h|  4 ++--
 arch/x86/kernel/head64.c   |  3 ++-
 arch/x86/kernel/paravirt.c |  2 +-
 arch/x86/kernel/process_32.c   |  2 +-
 arch/x86/kernel/process_64.c   |  2 +-
 arch/x86/kvm/vmx.c |  2 +-
 arch/x86/mm/fault.c| 10 +-
 arch/x86/mm/ioremap.c  |  2 +-
 arch/x86/platform/efi/efi_64.c |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c   |  2 +-
 arch/x86/power/hibernate_64.c  |  3 ++-
 arch/x86/xen/mmu_pv.c  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c 
b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..8e69df96492e 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
 * and we must append to the existing area instead of entirely
 * overwriting it.
 */
-   level4p = read_cr3();
+   level4p = read_cr3_pa();
if (level4p == (unsigned long)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin();   \
\
if (efi_scratch.use_pgd) {  \
-   efi_scratch.prev_cr3 = read_cr3();  \
+   efi_scratch.prev_cr3 = __read_cr3();\
write_cr3((unsigned long)efi_scratch.efi_pgt);  \
__flush_tlb_all();  \
}   \
diff --git a/arch/x86/include/asm/mmu_context.h 
b/arch/x86/include/asm/mmu_context.h
index 5a93f6261302..cfe6034ebfc6 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || !in_atomic());
 
-   VM_BUG_ON(cr3 != read_cr3());
+   VM_BUG_ON(cr3 != __read_cr3());
return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739d9f4b..a63e77f8eb41 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h 
b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 #else
 #define X86_VM_MASK0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-

Re: [Xen-devel] [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

2017-06-13 Thread Andy Lutomirski
On Tue, Jun 13, 2017 at 2:26 AM, Borislav Petkov  wrote:
> On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
>> The kernel has several code paths that read CR3.  Most of them assume that
>> CR3 contains the PGD's physical address, whereas some of them awkwardly
>> use PHYSICAL_PAGE_MASK to mask off low bits.
>>
>> Add explicit mask macros for CR3 and convert all of the CR3 readers.
>> This will keep them from breaking when PCID is enabled.
>
> ...
>
>> +/*
>> + * CR3's layout varies depending on several things.
>> + *
>> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space 
>> ID.
>> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
>> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
>> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
>> + * CR3[2:0] and CR3[11:5] are ignored.
>> + *
>> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and 
>> PCD.
>> + *
>> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
>> + * written as 1 to prevent the write to CR3 from flushing the TLB.
>> + *
>> + * On systems with SME, one bit (in a variable position!) is stolen to 
>> indicate
>> + * that the top-level paging structure is encrypted.
>> + *
>> + * All of the remaining bits indicate the physical address of the top-level
>> + * paging structure.
>> + *
>> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
>> + */
>> +#ifdef CONFIG_X86_64
>> +/* Mask off the address space ID bits. */
>> +#define CR3_ADDR_MASK 0x7000ull
>> +#define CR3_PCID_MASK 0xFFFull
>> +#else
>> +/*
>> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
>> + * a tiny bit of code size by setting all the bits.
>> + */
>> +#define CR3_ADDR_MASK 0xull
>> +#define CR3_PCID_MASK 0ull
>
> All those can do GENMASK_ULL for better readability.
>

Hmm.  I'll look at that.

>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 3586996fc50d..bc0a849589bb 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>>
>>   .read_cr2 = native_read_cr2,
>>   .write_cr2 = native_write_cr2,
>> - .read_cr3 = native_read_cr3,
>> + .read_cr3 = __native_read_cr3,
>>   .write_cr3 = native_write_cr3,
>>
>>   .flush_tlb_user = native_flush_tlb,
>> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
>> index ffeae818aa7a..c6d6dc5f8bb2 100644
>> --- a/arch/x86/kernel/process_32.c
>> +++ b/arch/x86/kernel/process_32.c
>> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>>
>>   cr0 = read_cr0();
>>   cr2 = read_cr2();
>> - cr3 = read_cr3();
>> + cr3 = __read_cr3();
>>   cr4 = __read_cr4();
>
> This is a good example for my confusion. So we have __read_cr4() - with
> the underscores - but not read_cr4().
>
> Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
> well lose the "__", right?
>
> Or are the PCID series bringing a read_cr3() without "__" too?

The intent was twofold:

1. Make sure that every read_cr3() instance got converted.  I didn't
want a mid-air collision with someone else's patch in which it would
appear to apply and compile but the result would randomly fail on PCID
systems.

2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
means "return this complicated register value -- I know what I'm
doing" and read_cr3_pa() means "give me the PA".

Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
I suppose.

What do you think?  My general preference is to clean this up after
the rest of the big patchsets (SME and PCID) land.

>
> Oh, and to confuse me even more, there's __native_read_cr3() which is
> *finally* accessing %cr3 :-) But there's native_write_cr3() without
> underscores.
>
> So can we make those names a bit more balanced please?

write_cr3() was less widespread, so I worried about it less.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 06/27] x86/entry/64: Adapt assembly for PIE support

2017-10-20 Thread Andy Lutomirski


> On Oct 20, 2017, at 5:20 PM, Ingo Molnar  wrote:
> 
> 
> * Thomas Garnier  wrote:
> 
   */
 - cmpq$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
 + leaq.Lentry_SYSCALL_64_after_fastpath_call(%rip), %r11
 + cmpq%r11, (%rsp)
  jne 1f
> 
>>> This patch seems to add extra overhead to the syscall fast-path even when 
>>> PIE is
>>> disabled, right?
>> 
>> It does add extra instructions when one is not possible, I preferred
>> that over ifdefing but I can change it.
> 
> So my problem is, this pattern repeats in many other places as well, but 
> sprinkling 
> various pieces of assembly code with #ifdefs would be very bad as well.
> 
> I have no good idea how to solve this.
> 

How about:

.macro JMP_TO_LABEL ...


> Thanks,
> 
>Ingo

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 06/27] x86/entry/64: Adapt assembly for PIE support

2017-10-20 Thread Andy Lutomirski


> On Oct 20, 2017, at 5:20 PM, Ingo Molnar  wrote:
> 
> 
> * Thomas Garnier  wrote:
> 
   */
 - cmpq$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
 + leaq.Lentry_SYSCALL_64_after_fastpath_call(%rip), %r11
 + cmpq%r11, (%rsp)
  jne 1f
> 
>>> This patch seems to add extra overhead to the syscall fast-path even when 
>>> PIE is
>>> disabled, right?
>> 
>> It does add extra instructions when one is not possible, I preferred
>> that over ifdefing but I can change it.
> 
> So my problem is, this pattern repeats in many other places as well, but 
> sprinkling 
> various pieces of assembly code with #ifdefs would be very bad as well.
> 
> I have no good idea how to solve this.
> 
> Thanks,

Ugh, brain was off.  This is a bit messy. We could use a macro for this, too, I 
suppose.

> 
>Ingo

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] Xen PV seems to be broken on Linus' tree

2017-11-21 Thread Andy Lutomirski
I'm doing:

/usr/bin/qemu-system-x86_64 -machine accel=kvm:tcg -cpu host -net none
-nographic -kernel xen-4.8.2 -initrd './arch/x86/boot/bzImage' -m 2G
-smp 2 -append console=com1

With Linus' commit c8a0739b185d11d6e2ca7ad9f5835841d1cfc765 and the
attached config.

It dies with a bunch of sensible log lines and then:

(XEN) d0v0 Unhandled invalid opcode fault/trap [#6, ec=]
(XEN) domain_crash_sync called from entry.S: fault at 82d08023961a
entry.o#create_bounce_frame+0x137/0x146
(XEN) Domain 0 (vcpu#0) crashed on cpu#0:
(XEN) [ Xen-4.8.2  x86_64  debug=n   Not tainted ]
(XEN) CPU:0
(XEN) RIP:e033:[]
(XEN) RFLAGS: 0296   EM: 1   CONTEXT: pv guest (d0v0)
(XEN) rax: 002f   rbx: 81e65a48   rcx: 81e71288
(XEN) rdx: 81e27500   rsi: 0001   rdi: 81133f88
(XEN) rbp:    rsp: 81e03e78   r8:  
(XEN) r9:  0001   r10:    r11: 
(XEN) r12:    r13: 0001   r14: 0001
(XEN) r15:    cr0: 8005003b   cr4: 003506e0
(XEN) cr3: 7b0b3000   cr2: 
(XEN) ds:    es:    fs:    gs:    ss: e02b   cs: e033
(XEN) Guest stack trace from rsp=81e03e78:
(XEN)81e71288  811226eb 0001e030
(XEN)00010096 81e03eb8 e02b 811226eb
(XEN)81122c2e 0200  
(XEN)0030 81c69cf5 81080b20 81080560
(XEN) 810d3741 8107b420 81094660

Is this familiar?

I'll feel really dumb if it ends up being my fault.


.config
Description: Binary data
___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] Xen PV seems to be broken on Linus' tree

2017-11-21 Thread Andy Lutomirski
On Tue, Nov 21, 2017 at 7:33 PM, Andy Lutomirski  wrote:
> I'm doing:
>
> /usr/bin/qemu-system-x86_64 -machine accel=kvm:tcg -cpu host -net none
> -nographic -kernel xen-4.8.2 -initrd './arch/x86/boot/bzImage' -m 2G
> -smp 2 -append console=com1
>
> With Linus' commit c8a0739b185d11d6e2ca7ad9f5835841d1cfc765 and the
> attached config.
>
> It dies with a bunch of sensible log lines and then:
>
> (XEN) d0v0 Unhandled invalid opcode fault/trap [#6, ec=]
> (XEN) domain_crash_sync called from entry.S: fault at 82d08023961a
> entry.o#create_bounce_frame+0x137/0x146
> (XEN) Domain 0 (vcpu#0) crashed on cpu#0:
> (XEN) [ Xen-4.8.2  x86_64  debug=n   Not tainted ]
> (XEN) CPU:0
> (XEN) RIP:e033:[]
> (XEN) RFLAGS: 0296   EM: 1   CONTEXT: pv guest (d0v0)
> (XEN) rax: 002f   rbx: 81e65a48   rcx: 81e71288
> (XEN) rdx: 81e27500   rsi: 0001   rdi: 81133f88
> (XEN) rbp:    rsp: 81e03e78   r8:  
> (XEN) r9:  0001   r10:    r11: 
> (XEN) r12:    r13: 0001   r14: 0001
> (XEN) r15:    cr0: 8005003b   cr4: 003506e0
> (XEN) cr3: 7b0b3000   cr2: 
> (XEN) ds:    es:    fs:    gs:    ss: e02b   cs: e033
> (XEN) Guest stack trace from rsp=81e03e78:
> (XEN)81e71288  811226eb 0001e030
> (XEN)00010096 81e03eb8 e02b 811226eb
> (XEN)81122c2e 0200  
> (XEN)0030 81c69cf5 81080b20 81080560
> (XEN) 810d3741 8107b420 81094660
>
> Is this familiar?
>
> I'll feel really dumb if it ends up being my fault.

Nah, it's broken at least back to v4.13, and I suspect it's config
related.  objdump gives me this:

8112b0e1:   e9 e8 fe ff ff  jmpq
8112afce 
8112b0e6:   48 c7 c6 2d f8 c8 81mov$0x81c8f82d,%rsi
8112b0ed:   48 c7 c7 58 b9 c8 81mov$0x81c8b958,%rdi
8112b0f4:   e8 13 2d 01 00  callq  8113de0c 
8112b0f9:   0f ff   (bad)   <-- crash here

That's "ud0", which is used by WARN.  So we're probably hitting an
early warning and Xen probably has something busted with early
exception handling.

Anyone want to debug it and fix it?

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] Xen PV seems to be broken on Linus' tree

2017-11-21 Thread Andy Lutomirski
On Tue, Nov 21, 2017 at 8:11 PM, Andy Lutomirski  wrote:
> On Tue, Nov 21, 2017 at 7:33 PM, Andy Lutomirski  wrote:
>> I'm doing:
>>
>> /usr/bin/qemu-system-x86_64 -machine accel=kvm:tcg -cpu host -net none
>> -nographic -kernel xen-4.8.2 -initrd './arch/x86/boot/bzImage' -m 2G
>> -smp 2 -append console=com1
>>
>> With Linus' commit c8a0739b185d11d6e2ca7ad9f5835841d1cfc765 and the
>> attached config.
>>
>> It dies with a bunch of sensible log lines and then:
>>
>> (XEN) d0v0 Unhandled invalid opcode fault/trap [#6, ec=]
>> (XEN) domain_crash_sync called from entry.S: fault at 82d08023961a
>> entry.o#create_bounce_frame+0x137/0x146
>> (XEN) Domain 0 (vcpu#0) crashed on cpu#0:
>> (XEN) [ Xen-4.8.2  x86_64  debug=n   Not tainted ]
>> (XEN) CPU:0
>> (XEN) RIP:e033:[]
>> (XEN) RFLAGS: 0296   EM: 1   CONTEXT: pv guest (d0v0)
>> (XEN) rax: 002f   rbx: 81e65a48   rcx: 81e71288
>> (XEN) rdx: 81e27500   rsi: 0001   rdi: 81133f88
>> (XEN) rbp:    rsp: 81e03e78   r8:  
>> (XEN) r9:  0001   r10:    r11: 
>> (XEN) r12:    r13: 0001   r14: 0001
>> (XEN) r15:    cr0: 8005003b   cr4: 003506e0
>> (XEN) cr3: 7b0b3000   cr2: 
>> (XEN) ds:    es:    fs:    gs:    ss: e02b   cs: e033
>> (XEN) Guest stack trace from rsp=81e03e78:
>> (XEN)81e71288  811226eb 0001e030
>> (XEN)00010096 81e03eb8 e02b 811226eb
>> (XEN)81122c2e 0200  
>> (XEN)0030 81c69cf5 81080b20 81080560
>> (XEN) 810d3741 8107b420 81094660
>>
>> Is this familiar?
>>
>> I'll feel really dumb if it ends up being my fault.
>
> Nah, it's broken at least back to v4.13, and I suspect it's config
> related.  objdump gives me this:
>
> 8112b0e1:   e9 e8 fe ff ff  jmpq
> 8112afce 
> 8112b0e6:   48 c7 c6 2d f8 c8 81mov
> $0x81c8f82d,%rsi
> 8112b0ed:   48 c7 c7 58 b9 c8 81mov
> $0x81c8b958,%rdi
> 8112b0f4:   e8 13 2d 01 00  callq  8113de0c 
> 
> 8112b0f9:   0f ff   (bad)   <-- crash here
>
> That's "ud0", which is used by WARN.  So we're probably hitting an
> early warning and Xen probably has something busted with early
> exception handling.
>
> Anyone want to debug it and fix it?

Well, I think I debugged it.  x86_64 has a shiny function
idt_setup_early_handler(), and Xen doesn't call it.  Fixing the
problem may be as simple as calling it at an appropriate time and
doing whatever asm magic is needed to deal with Xen's weird IDT
calling convention.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] Xen PV seems to be broken on Linus' tree

2017-11-22 Thread Andy Lutomirski
On Wed, Nov 22, 2017 at 4:50 AM, Juergen Gross  wrote:
> On 22/11/17 05:46, Andy Lutomirski wrote:
>> On Tue, Nov 21, 2017 at 8:11 PM, Andy Lutomirski  wrote:
>>> On Tue, Nov 21, 2017 at 7:33 PM, Andy Lutomirski  wrote:
>>>> I'm doing:
>>>>
>>>> /usr/bin/qemu-system-x86_64 -machine accel=kvm:tcg -cpu host -net none
>>>> -nographic -kernel xen-4.8.2 -initrd './arch/x86/boot/bzImage' -m 2G
>>>> -smp 2 -append console=com1
>>>>
>>>> With Linus' commit c8a0739b185d11d6e2ca7ad9f5835841d1cfc765 and the
>>>> attached config.
>>>>
>>>> It dies with a bunch of sensible log lines and then:
>>>>
>>>> (XEN) d0v0 Unhandled invalid opcode fault/trap [#6, ec=]
>>>> (XEN) domain_crash_sync called from entry.S: fault at 82d08023961a
>>>> entry.o#create_bounce_frame+0x137/0x146
>>>> (XEN) Domain 0 (vcpu#0) crashed on cpu#0:
>>>> (XEN) [ Xen-4.8.2  x86_64  debug=n   Not tainted ]
>>>> (XEN) CPU:0
>>>> (XEN) RIP:e033:[]
>>>> (XEN) RFLAGS: 0296   EM: 1   CONTEXT: pv guest (d0v0)
>>>> (XEN) rax: 002f   rbx: 81e65a48   rcx: 81e71288
>>>> (XEN) rdx: 81e27500   rsi: 0001   rdi: 81133f88
>>>> (XEN) rbp:    rsp: 81e03e78   r8:  
>>>> (XEN) r9:  0001   r10:    r11: 
>>>> (XEN) r12:    r13: 0001   r14: 0001
>>>> (XEN) r15:    cr0: 8005003b   cr4: 003506e0
>>>> (XEN) cr3: 7b0b3000   cr2: 
>>>> (XEN) ds:    es:    fs:    gs:    ss: e02b   cs: e033
>>>> (XEN) Guest stack trace from rsp=81e03e78:
>>>> (XEN)81e71288  811226eb 
>>>> 0001e030
>>>> (XEN)00010096 81e03eb8 e02b 
>>>> 811226eb
>>>> (XEN)81122c2e 0200  
>>>> 
>>>> (XEN)0030 81c69cf5 81080b20 
>>>> 81080560
>>>> (XEN) 810d3741 8107b420 
>>>> 81094660
>>>>
>>>> Is this familiar?
>>>>
>>>> I'll feel really dumb if it ends up being my fault.
>>>
>>> Nah, it's broken at least back to v4.13, and I suspect it's config
>>> related.  objdump gives me this:
>>>
>>> 8112b0e1:   e9 e8 fe ff ff  jmpq
>>> 8112afce 
>>> 8112b0e6:   48 c7 c6 2d f8 c8 81mov
>>> $0x81c8f82d,%rsi
>>> 8112b0ed:   48 c7 c7 58 b9 c8 81mov
>>> $0x81c8b958,%rdi
>>> 8112b0f4:   e8 13 2d 01 00  callq  8113de0c 
>>> 
>>> 8112b0f9:   0f ff   (bad)   <-- crash here
>>>
>>> That's "ud0", which is used by WARN.  So we're probably hitting an
>>> early warning and Xen probably has something busted with early
>>> exception handling.
>>>
>>> Anyone want to debug it and fix it?
>>
>> Well, I think I debugged it.  x86_64 has a shiny function
>> idt_setup_early_handler(), and Xen doesn't call it.  Fixing the
>> problem may be as simple as calling it at an appropriate time and
>> doing whatever asm magic is needed to deal with Xen's weird IDT
>> calling convention.
>
> Hmm, yes, this should work. I'll have a try.
>
> BTW: I don't think this ever worked.
>

The ud0 trick itself is fairly recent, so old enough kernels (4.10?  I
don't really remember) wouldn't die just because of an early warning.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1] xen: get rid of paravirt op adjust_exception_frame

2017-07-26 Thread Andy Lutomirski
On Mon, Jul 24, 2017 at 7:28 AM, Juergen Gross  wrote:
> When running as Xen pv-guest the exception frame on the stack contains
> %r11 and %rcx additional to the other data pushed by the processor.
>
> Instead of having a paravirt op being called for each exception type
> prepend the Xen specific code to each exception entry. When running as
> Xen pv-guest just use the exception entry with prepended instructions,
> otherwise use the entry without the Xen specific code.

I think this is a nice cleanup, but I'm wondering if it would be even
nicer if the Xen part was kept out-of-line.  That is, could Xen have
little stubs like:

xen_alignment_check:
  pop %rcx
  pop %r11
  jmp alignment_check

rather than using the macros in entry_64.S that you have?  Then you
could adjust set_trap_gate instead of pack_gate and maybe even do
something like:

#define set_trap_gate(..., name, ...) set_native_or_xen_trap_gate(...,
name, xen_##name, ...)

>  /* Runs on exception stack */
> -ENTRY(nmi)
> -   /*
> -* Fix up the exception frame if we're on Xen.
> -* PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
> -* one value to the stack on native, so it may clobber the rdx
> -* scratch slot, but it won't clobber any of the important
> -* slots past it.
> -*
> -* Xen is a different story, because the Xen frame itself overlaps
> -* the "NMI executing" variable.
> -*/

I would keep this comment.  The Xen frame really is in the way AFAICT.


--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1] xen: get rid of paravirt op adjust_exception_frame

2017-07-26 Thread Andy Lutomirski
On Wed, Jul 26, 2017 at 7:01 AM, Andrew Cooper
 wrote:
> On 26/07/17 14:48, Andy Lutomirski wrote:
>>
>>>  /* Runs on exception stack */
>>> -ENTRY(nmi)
>>> -   /*
>>> -* Fix up the exception frame if we're on Xen.
>>> -* PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
>>> -* one value to the stack on native, so it may clobber the rdx
>>> -* scratch slot, but it won't clobber any of the important
>>> -* slots past it.
>>> -*
>>> -* Xen is a different story, because the Xen frame itself overlaps
>>> -* the "NMI executing" variable.
>>> -*/
>> I would keep this comment.  The Xen frame really is in the way AFAICT.
>
> (For reasons best explained by the original authors) there is only ever
> a single stack which a PV guest registers with Xen, which functions
> equivalently to tss.sp0.  There is no support for stack switching via
> task switch or IST.
>
> Therefore, nested NMIs won't clobber the top of this stack.
>

Does that mean that nested NMIs on Xen just nest normally without
clobbering each other?  If so, that's neat, although I wonder how we
don't get crashes due to this:

/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
undo_xen_syscall
<-- NMI right here
jmp entry_SYSCALL_64_after_swapgs
ENDPROC(xen_syscall_target)

I think it would be nice if Xen could instead enter the native syscall
path a bit later like this:

ENTRY(entry_SYSCALL_64)
/*
 * Interrupts are off on entry.
 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
 * it is too small to ever cause noticeable irq latency.
 */
SWAPGS_UNSAFE_STACK
/*
 * A hypervisor implementation might want to use a label
 * after the swapgs, so that it can do the swapgs
 * for the guest and jump here on syscall.
 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

movq%rsp, PER_CPU_VAR(rsp_scratch)
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp

TRACE_IRQS_OFF

/* Construct struct pt_regs on stack */
pushq   $__USER_DS  /* pt_regs->ss */
pushq   PER_CPU_VAR(rsp_scratch)/* pt_regs->sp */
pushq   %r11/* pt_regs->flags */
pushq   $__USER_CS  /* pt_regs->cs */
pushq   %rcx/* pt_regs->ip */

<-- Xen enters here

then we wouldn't have all this funny business.  And Xen could
completely skip the nmi nesting code.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1] xen: get rid of paravirt op adjust_exception_frame

2017-07-26 Thread Andy Lutomirski


> On Jul 26, 2017, at 11:50 AM, Juergen Gross  wrote:
> 
>> On 26/07/17 15:48, Andy Lutomirski wrote:
>>> On Mon, Jul 24, 2017 at 7:28 AM, Juergen Gross  wrote:
>>> When running as Xen pv-guest the exception frame on the stack contains
>>> %r11 and %rcx additional to the other data pushed by the processor.
>>> 
>>> Instead of having a paravirt op being called for each exception type
>>> prepend the Xen specific code to each exception entry. When running as
>>> Xen pv-guest just use the exception entry with prepended instructions,
>>> otherwise use the entry without the Xen specific code.
>> 
>> I think this is a nice cleanup, but I'm wondering if it would be even
>> nicer if the Xen part was kept out-of-line.  That is, could Xen have
>> little stubs like:
>> 
>> xen_alignment_check:
>>  pop %rcx
>>  pop %r11
>>  jmp alignment_check
>> 
>> rather than using the macros in entry_64.S that you have?  Then you
>> could adjust set_trap_gate instead of pack_gate and maybe even do
>> something like:
>> 
>> #define set_trap_gate(..., name, ...) set_native_or_xen_trap_gate(...,
>> name, xen_##name, ...)
> 
> I think I'll have something like:
> 
> #define pv_trap_entry(name) (xen_pv_domain() ? xen_ ## name : name)
> 
> and use it like:
> 
> set_intr_gate(X86_TRAP_AC, pv_trap_entry(alignment_check));
> 
> This will avoid having to define macros for all variants of
> set_intr_gate(), e.g. set_intr_gate_ist(), set_system_intr_gate().
> 
> Do you have any objections?
> 

Sounds good to me.

FWIW, I have no real objection to putting the Xen entry right before the native 
entry and falling through.  I don't love the ip -= 3 bit, though, and I think 
that the PV_ENTRY macro is too magical.

This might be okay, though:

XEN_PV_ENTRY_FALLTHROUGH(foo)
ENTRY(foo)
  code here



> 
> Juergen

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1] xen: get rid of paravirt op adjust_exception_frame

2017-07-26 Thread Andy Lutomirski


> On Jul 26, 2017, at 2:43 PM, Juergen Gross  wrote:
> 
>> On 26/07/17 19:57, Andy Lutomirski wrote:
>> 
>> 
>>>> On Jul 26, 2017, at 11:50 AM, Juergen Gross  wrote:
>>>> 
>>>>> On 26/07/17 15:48, Andy Lutomirski wrote:
>>>>> On Mon, Jul 24, 2017 at 7:28 AM, Juergen Gross  wrote:
>>>>> When running as Xen pv-guest the exception frame on the stack contains
>>>>> %r11 and %rcx additional to the other data pushed by the processor.
>>>>> 
>>>>> Instead of having a paravirt op being called for each exception type
>>>>> prepend the Xen specific code to each exception entry. When running as
>>>>> Xen pv-guest just use the exception entry with prepended instructions,
>>>>> otherwise use the entry without the Xen specific code.
>>>> 
>>>> I think this is a nice cleanup, but I'm wondering if it would be even
>>>> nicer if the Xen part was kept out-of-line.  That is, could Xen have
>>>> little stubs like:
>>>> 
>>>> xen_alignment_check:
>>>> pop %rcx
>>>> pop %r11
>>>> jmp alignment_check
>>>> 
>>>> rather than using the macros in entry_64.S that you have?  Then you
>>>> could adjust set_trap_gate instead of pack_gate and maybe even do
>>>> something like:
>>>> 
>>>> #define set_trap_gate(..., name, ...) set_native_or_xen_trap_gate(...,
>>>> name, xen_##name, ...)
>>> 
>>> I think I'll have something like:
>>> 
>>> #define pv_trap_entry(name) (xen_pv_domain() ? xen_ ## name : name)
>>> 
>>> and use it like:
>>> 
>>> set_intr_gate(X86_TRAP_AC, pv_trap_entry(alignment_check));
>>> 
>>> This will avoid having to define macros for all variants of
>>> set_intr_gate(), e.g. set_intr_gate_ist(), set_system_intr_gate().
>>> 
>>> Do you have any objections?
>>> 
>> 
>> Sounds good to me.
>> 
>> FWIW, I have no real objection to putting the Xen entry right before the 
>> native entry and falling through.  I don't love the ip -= 3 bit, though, and 
>> I think that the PV_ENTRY macro is too magical.
>> 
>> This might be okay, though:
>> 
>> XEN_PV_ENTRY_FALLTHROUGH(foo)
>> ENTRY(foo)
> 
> ENTRY() aligns on 16 byte boundary. So I have to avoid ENTRY(foo) above
> in the Xen case when I want to fall through.
> 
> So either I have to do something like PV_ENTRY (I could avoid the magic
> "3" by using the xen_foo entry via pv_trap_entry()), or I need the stub
> with "jmp" for Xen.

Hmm.  Either the 16 byte alignment is pointless or the PV_ENTRY macro is wrong.

Anyway, the jmp is probably the best approach.  Then we could eventually make a 
.text.xen_pv section and discard it on non-xen_pv boots.

> 
> 
> Juergen

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2] xen: get rid of paravirt op adjust_exception_frame

2017-08-01 Thread Andy Lutomirski
On Tue, Aug 1, 2017 at 3:39 AM, Juergen Gross  wrote:
> When running as Xen pv-guest the exception frame on the stack contains
> %r11 and %rcx additional to the other data pushed by the processor.
>
> Instead of having a paravirt op being called for each exception type
> prepend the Xen specific code to each exception entry. When running as
> Xen pv-guest just use the exception entry with prepended instructions,
> otherwise use the entry without the Xen specific code.
>
> Signed-off-by: Juergen Gross 
> ---
>  arch/x86/entry/entry_64.S | 22 ++
>  arch/x86/entry/entry_64_compat.S  |  1 -
>  arch/x86/include/asm/desc.h   | 16 ++
>  arch/x86/include/asm/paravirt.h   |  5 ---
>  arch/x86/include/asm/paravirt_types.h |  4 ---
>  arch/x86/include/asm/proto.h  |  3 ++
>  arch/x86/include/asm/traps.h  | 51 +--
>  arch/x86/kernel/asm-offsets_64.c  |  1 -
>  arch/x86/kernel/paravirt.c|  3 --
>  arch/x86/kernel/traps.c   | 57 
> ++-
>  arch/x86/xen/enlighten_pv.c   | 16 +-
>  arch/x86/xen/irq.c|  3 --
>  arch/x86/xen/xen-asm_64.S | 45 ---
>  arch/x86/xen/xen-ops.h|  1 -
>  14 files changed, 147 insertions(+), 81 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index a9a8027a6c0e..602bcf68a32c 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -745,7 +745,6 @@ ENTRY(\sym)
> .endif
>
> ASM_CLAC
> -   PARAVIRT_ADJUST_EXCEPTION_FRAME
>
> .ifeq \has_error_code
> pushq   $-1 /* ORIG_RAX: no syscall to 
> restart */
> @@ -901,7 +900,7 @@ ENTRY(do_softirq_own_stack)
>  END(do_softirq_own_stack)
>
>  #ifdef CONFIG_XEN
> -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
> +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
>
>  /*
>   * A note on the "critical region" in our callback handler.
> @@ -967,8 +966,6 @@ ENTRY(xen_failsafe_callback)
> movq8(%rsp), %r11
> addq$0x30, %rsp
> pushq   $0  /* RIP */
> -   pushq   %r11
> -   pushq   %rcx
> jmp general_protection
>  1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
> movq(%rsp), %rcx
> @@ -997,9 +994,8 @@ idtentry int3   do_int3   
>   has_error_code=0paranoid=1 shift_ist=DEBUG_STACK
>  idtentry stack_segment do_stack_segmenthas_error_code=1
>
>  #ifdef CONFIG_XEN
> -idtentry xen_debug do_debughas_error_code=0
> -idtentry xen_int3  do_int3 has_error_code=0
> -idtentry xen_stack_segment do_stack_segmenthas_error_code=1
> +idtentry xendebug  do_debughas_error_code=0
> +idtentry xenint3   do_int3 has_error_code=0
>  #endif
>
>  idtentry general_protectiondo_general_protection   has_error_code=1
> @@ -1161,18 +1157,6 @@ END(error_exit)
>  /* Runs on exception stack */
>  ENTRY(nmi)
> /*
> -* Fix up the exception frame if we're on Xen.
> -* PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
> -* one value to the stack on native, so it may clobber the rdx
> -* scratch slot, but it won't clobber any of the important
> -* slots past it.
> -*
> -* Xen is a different story, because the Xen frame itself overlaps
> -* the "NMI executing" variable.
> -*/

Based on Andrew Cooper's email, it sounds like this function is just
straight-up broken on Xen PV.  Maybe change the comment to "XXX:
broken on Xen PV" or similar.

> +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
> +#define pv_trap_entry(name) (void *)(xen_pv_domain() ? xen_ ## name : name)
> +#else
> +#define pv_trap_entry(name) (void *)(name)
> +#endif
> +

Seems reasonable to me.

>  #ifdef CONFIG_X86_64
>
>  static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long 
> func,
> @@ -482,6 +490,14 @@ static inline void _set_gate(int gate, unsigned type, 
> void *addr,
> 0, 0, __KERNEL_CS); \
> } while (0)
>
> +#define set_intr_gate_pv(n, addr)  \
> +   do {\
> +   set_intr_gate_notrace(n, pv_trap_entry(addr));  \
> +   _trace_set_gate(n, GATE_INTERRUPT,  \
> +   pv_trap_entry(trace_##addr),\
> +   0, 0, __KERNEL_CS); \
> +   } while (0)

Any reason this can't be set_intr_gate((n), pv_trap_entry(addr))?  Or
does that g

Re: [Xen-devel] [PATCH v2] xen: get rid of paravirt op adjust_exception_frame

2017-08-01 Thread Andy Lutomirski
On Tue, Aug 1, 2017 at 4:38 PM, Andrew Cooper  wrote:
> On 01/08/2017 20:45, Andy Lutomirski wrote:
>> Also, IMO it would be nice to fully finish the job.  Remaining steps are:
>>
>> 1. Unsuck the SYSCALL entries on Xen PV.
>> 2. Unsuck the SYSENTER entry on Xen PV.
>> 3. Make a xen_nmi that's actually correct (should be trivial)
>>
>> #1 is here:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/entry_syscall&id=14fee348e3e86c994400d68085217d1232a637d6
>
> If the
>
> /* Zero-extending 32-bit regs, do not remove */
> movl%eax, %eax
>
> comments are to be believed, then the entry logic needs reordering
> slightly to:
>
> GLOBAL(entry_*_compat_after_hwframe)
> movl%eax, %eax/* Zero-extending 32-bit regs, do not remove */
> pushq%rax/* pt_regs->orig_ax */

D'oh, right.  Juergen, want to make that change?

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3] xen: get rid of paravirt op adjust_exception_frame

2017-08-07 Thread Andy Lutomirski
On Mon, Aug 7, 2017 at 1:56 PM, Boris Ostrovsky
 wrote:
>
>> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
>> index 811e4ddb3f37..a3dcd83187ce 100644
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -579,6 +579,71 @@ static void xen_write_ldt_entry(struct desc_struct *dt, 
>> int entrynum,
>>   preempt_enable();
>>  }
>>
>> +#ifdef CONFIG_X86_64
>> +static struct {
>> + void (*orig)(void);
>> + void (*xen)(void);
>> + bool ist_okay;
>> + bool handle;
>> +} trap_array[] = {
>> + { debug, xen_xendebug, true, true },
>> + { int3, xen_xenint3, true, true },
>> + { double_fault, xen_double_fault, true, false },
>
> Is it really worth adding 'handle' member to the structure because of a
> single special case? We don't expect to ever have another such vector.
>
> (TBH, I think current implementation of cvt_gate_to_trap() is clearer,
> even if it is not as general as what is in this patch. I know that Andy
> disagrees).

I have no real opinion either way.  I just think it's nicer to put it
in cvt_gate_to_trap() instead of the the traps.c setup code.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2] x86/xen/64: Rearrange the SYSCALL entries

2017-08-07 Thread Andy Lutomirski
Xen's raw SYSCALL entries are much less weird than native.  Rather
than fudging them to look like native entries, use the Xen-provided
stack frame directly.

This lets us eliminate entry_SYSCALL_64_after_swapgs and two uses of
the SWAPGS_UNSAFE_STACK paravirt hook.  The SYSENTER code would
benefit from similar treatment.

This makes one change to the native code path: the compat
instruction that clears the high 32 bits of %rax is moved slightly
later.  I'd be surprised if this affects performance at all.

Signed-off-by: Andy Lutomirski 
---

Changes from v1 (which I never actually emailed):
 - Fix zero-extension in the compat case.

 arch/x86/entry/entry_64.S|  9 ++---
 arch/x86/entry/entry_64_compat.S |  7 +++
 arch/x86/xen/xen-asm_64.S| 23 +--
 3 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index aa58155187c5..7cee92cf807f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,14 +142,8 @@ ENTRY(entry_SYSCALL_64)
 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
 * it is too small to ever cause noticeable irq latency.
 */
-   SWAPGS_UNSAFE_STACK
-   /*
-* A hypervisor implementation might want to use a label
-* after the swapgs, so that it can do the swapgs
-* for the guest and jump here on syscall.
-*/
-GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+   swapgs
movq%rsp, PER_CPU_VAR(rsp_scratch)
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -161,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq   %r11/* pt_regs->flags */
pushq   $__USER_CS  /* pt_regs->cs */
pushq   %rcx/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq   %rax/* pt_regs->orig_ax */
pushq   %rdi/* pt_regs->di */
pushq   %rsi/* pt_regs->si */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..5314d7b8e5ad 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
  */
 ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
-   SWAPGS_UNSAFE_STACK
+   swapgs
 
/* Stash user ESP and switch to the kernel stack. */
movl%esp, %r8d
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-   /* Zero-extending 32-bit regs, do not remove */
-   movl%eax, %eax
-
/* Construct struct pt_regs on stack */
pushq   $__USER32_DS/* pt_regs->ss */
pushq   %r8 /* pt_regs->sp */
pushq   %r11/* pt_regs->flags */
pushq   $__USER32_CS/* pt_regs->cs */
pushq   %rcx/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+   movl%eax, %eax  /* discard orig_ax high bits */
pushq   %rax/* pt_regs->orig_ax */
pushq   %rdi/* pt_regs->di */
pushq   %rsi/* pt_regs->si */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..a8a4f4c460a6 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -82,34 +82,29 @@ RELOC(xen_sysret64, 1b+1)
  * rip
  * r11
  * rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
  */
 
-.macro undo_xen_syscall
-   mov 0*8(%rsp), %rcx
-   mov 1*8(%rsp), %r11
-   mov 5*8(%rsp), %rsp
-.endm
-
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
-   undo_xen_syscall
-   jmp entry_SYSCALL_64_after_swapgs
+   popq %rcx
+   popq %r11
+   jmp entry_SYSCALL_64_after_hwframe
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
 
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
-   undo_xen_syscall
-   jmp entry_SYSCALL_compat
+   popq %rcx
+   popq %r11
+   jmp entry_SYSCALL_compat_after_hwframe
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 ENTRY(xen_sysenter_target)
-   undo_xen_syscall
+   mov 0*8(%rsp), %rcx
+   mov 1*8(%rsp), %r11
+   mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
 ENDPROC(xen_sysenter_target)
 
-- 
2.13.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2] x86/xen/64: Rearrange the SYSCALL entries

2017-08-13 Thread Andy Lutomirski
On Sun, Aug 13, 2017 at 7:44 PM, Brian Gerst  wrote:
> On Mon, Aug 7, 2017 at 11:59 PM, Andy Lutomirski  wrote:
>>  /* Normal 64-bit system call target */
>>  ENTRY(xen_syscall_target)
>> -   undo_xen_syscall
>> -   jmp entry_SYSCALL_64_after_swapgs
>> +   popq %rcx
>> +   popq %r11
>> +   jmp entry_SYSCALL_64_after_hwframe
>>  ENDPROC(xen_syscall_target)
>>
>>  #ifdef CONFIG_IA32_EMULATION
>>
>>  /* 32-bit compat syscall target */
>>  ENTRY(xen_syscall32_target)
>> -   undo_xen_syscall
>> -   jmp entry_SYSCALL_compat
>> +   popq %rcx
>> +   popq %r11
>> +   jmp entry_SYSCALL_compat_after_hwframe
>>  ENDPROC(xen_syscall32_target)
>>
>>  /* 32-bit compat sysenter target */
>>  ENTRY(xen_sysenter_target)
>> -   undo_xen_syscall
>> +   mov 0*8(%rsp), %rcx
>> +   mov 1*8(%rsp), %r11
>> +   mov 5*8(%rsp), %rsp
>> jmp entry_SYSENTER_compat
>>  ENDPROC(xen_sysenter_target)
>
> This patch causes the iopl_32 and ioperm_32 self-tests to fail on a
> 64-bit PV kernel.  The 64-bit versions pass. It gets a seg fault after
> "parent: write to 0x80 (should fail)", and the fault isn't caught by
> the signal handler.  It just dumps back to the shell.  The tests pass
> after reverting this.

I can reproduce it if I emulate an AMD machine.  I can "fix" it like this:

diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index a8a4f4c460a6..6255e00f425e 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -97,6 +97,9 @@ ENDPROC(xen_syscall_target)
 ENTRY(xen_syscall32_target)
popq %rcx
popq %r11
+   movq $__USER32_DS, 4*8(%rsp)
+   movq $__USER32_CS, 1*8(%rsp)
+   movq %r11, 2*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
 ENDPROC(xen_syscall32_target)

but I haven't tried to diagnose precisely what's going on.

Xen seems to be putting the 0xe0?? values in ss and cs, which oughtn't
to be a problem, but it kills opportunistic sysretl.  Maybe that's
triggering a preexisting bug?

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2] x86/xen/64: Rearrange the SYSCALL entries

2017-08-13 Thread Andy Lutomirski
On Sun, Aug 13, 2017 at 10:53 PM, Andy Lutomirski  wrote:
> On Sun, Aug 13, 2017 at 7:44 PM, Brian Gerst  wrote:
>> On Mon, Aug 7, 2017 at 11:59 PM, Andy Lutomirski  wrote:
>>>  /* Normal 64-bit system call target */
>>>  ENTRY(xen_syscall_target)
>>> -   undo_xen_syscall
>>> -   jmp entry_SYSCALL_64_after_swapgs
>>> +   popq %rcx
>>> +   popq %r11
>>> +   jmp entry_SYSCALL_64_after_hwframe
>>>  ENDPROC(xen_syscall_target)
>>>
>>>  #ifdef CONFIG_IA32_EMULATION
>>>
>>>  /* 32-bit compat syscall target */
>>>  ENTRY(xen_syscall32_target)
>>> -   undo_xen_syscall
>>> -   jmp entry_SYSCALL_compat
>>> +   popq %rcx
>>> +   popq %r11
>>> +   jmp entry_SYSCALL_compat_after_hwframe
>>>  ENDPROC(xen_syscall32_target)
>>>
>>>  /* 32-bit compat sysenter target */
>>>  ENTRY(xen_sysenter_target)
>>> -   undo_xen_syscall
>>> +   mov 0*8(%rsp), %rcx
>>> +   mov 1*8(%rsp), %r11
>>> +   mov 5*8(%rsp), %rsp
>>> jmp entry_SYSENTER_compat
>>>  ENDPROC(xen_sysenter_target)
>>
>> This patch causes the iopl_32 and ioperm_32 self-tests to fail on a
>> 64-bit PV kernel.  The 64-bit versions pass. It gets a seg fault after
>> "parent: write to 0x80 (should fail)", and the fault isn't caught by
>> the signal handler.  It just dumps back to the shell.  The tests pass
>> after reverting this.
>
> I can reproduce it if I emulate an AMD machine.  I can "fix" it like this:
>
> diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
> index a8a4f4c460a6..6255e00f425e 100644
> --- a/arch/x86/xen/xen-asm_64.S
> +++ b/arch/x86/xen/xen-asm_64.S
> @@ -97,6 +97,9 @@ ENDPROC(xen_syscall_target)
>  ENTRY(xen_syscall32_target)
> popq %rcx
> popq %r11
> +   movq $__USER32_DS, 4*8(%rsp)
> +   movq $__USER32_CS, 1*8(%rsp)
> +   movq %r11, 2*8(%rsp)
> jmp entry_SYSCALL_compat_after_hwframe
>  ENDPROC(xen_syscall32_target)
>
> but I haven't tried to diagnose precisely what's going on.
>
> Xen seems to be putting the 0xe0?? values in ss and cs, which oughtn't
> to be a problem, but it kills opportunistic sysretl.  Maybe that's
> triggering a preexisting bug?

It is indeed triggering an existing bug, but that bug is not a kernel
bug :)  It's this thing:

https://sourceware.org/bugzilla/show_bug.cgi?id=21269

See, we have this old legacy garbage in which, when running with
nonstandard SS, a certain special, otherwise nonsensical input to
sigaction() causes a stack switch.  Xen PV runs user code with a
nonstandard SS, and glibc accidentally passes this weird parameter to
sigaction() on a regular basis.  With this patch applied, the kernel
suddenly starts to *realize* that ss is weird, and boom.  (Or maybe it
increases the chance that SS is actually weird, since I'd expect this
to trip on #GP, not SYSCALL.  But I don't care quite enough to dig
further.)

Patch coming.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH] x86/xen/64: Fix the reported SS and CS in SYSCALL

2017-08-14 Thread Andy Lutomirski
When I cleaned up the Xen SYSCALL entries, I inadvertently changed
the reported segment registers.  Before my patch, regs->ss was
__USER(32)_DS and regs->cs was __USER(32)_CS.  After the patch, they
are FLAT_USER_CS/DS(32).

This had a couple unfortunate effects.  It confused the
opportunistic fast return logic.  It also significantly increased
the risk of triggering a nasty glibc bug:

https://sourceware.org/bugzilla/show_bug.cgi?id=21269

Update the Xen entry code to change it back.

Reported-by: Brian Gerst 
Fixes: 8a9949bc71a7 ("x86/xen/64: Rearrange the SYSCALL entries")
Signed-off-by: Andy Lutomirski 
---
 arch/x86/xen/xen-asm_64.S | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index a8a4f4c460a6..c5fee2680abc 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -88,6 +88,15 @@ RELOC(xen_sysret64, 1b+1)
 ENTRY(xen_syscall_target)
popq %rcx
popq %r11
+
+   /*
+* Neither Xen nor the kernel really knows what the old SS and
+* CS were.  The kernel expects __USER_DS and __USER_CS, so
+* report those values even though Xen will guess its own values.
+*/
+   movq $__USER_DS, 4*8(%rsp)
+   movq $__USER_CS, 1*8(%rsp)
+
jmp entry_SYSCALL_64_after_hwframe
 ENDPROC(xen_syscall_target)
 
@@ -97,6 +106,15 @@ ENDPROC(xen_syscall_target)
 ENTRY(xen_syscall32_target)
popq %rcx
popq %r11
+
+   /*
+* Neither Xen nor the kernel really knows what the old SS and
+* CS were.  The kernel expects __USER32_DS and __USER32_CS, so
+* report those values even though Xen will guess its own values.
+*/
+   movq $__USER32_DS, 4*8(%rsp)
+   movq $__USER32_CS, 1*8(%rsp)
+
jmp entry_SYSCALL_compat_after_hwframe
 ENDPROC(xen_syscall32_target)
 
-- 
2.13.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v7 11/36] x86/mm: Add SME support for read_cr3_pa()

2017-06-20 Thread Andy Lutomirski
On Fri, Jun 16, 2017 at 11:51 AM, Tom Lendacky  wrote:
> The cr3 register entry can contain the SME encryption mask that indicates
> the PGD is encrypted.  The encryption mask should not be used when
> creating a virtual address from the cr3 register, so remove the SME
> encryption mask in the read_cr3_pa() function.
>
> During early boot SME will need to use a native version of read_cr3_pa(),
> so create native_read_cr3_pa().
>
> Signed-off-by: Tom Lendacky 
> ---
>  arch/x86/include/asm/processor-flags.h |3 ++-
>  arch/x86/include/asm/processor.h   |5 +
>  2 files changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/processor-flags.h 
> b/arch/x86/include/asm/processor-flags.h
> index 79aa2f9..cb6999c 100644
> --- a/arch/x86/include/asm/processor-flags.h
> +++ b/arch/x86/include/asm/processor-flags.h
> @@ -2,6 +2,7 @@
>  #define _ASM_X86_PROCESSOR_FLAGS_H
>
>  #include 
> +#include 
>
>  #ifdef CONFIG_VM86
>  #define X86_VM_MASKX86_EFLAGS_VM
> @@ -33,7 +34,7 @@
>   */
>  #ifdef CONFIG_X86_64
>  /* Mask off the address space ID bits. */
> -#define CR3_ADDR_MASK 0x7000ull
> +#define CR3_ADDR_MASK __sme_clr(0x7000ull)

Can you update the comment one line above, too?

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 2/3] x86 Test and expose CPUID faulting capabilities in /proc/cpuinfo

2016-09-14 Thread Andy Lutomirski
On Wed, Sep 14, 2016 at 3:03 PM, Kyle Huey  wrote:
> On Wed, Sep 14, 2016 at 2:35 PM, Dave Hansen
>  wrote:
>> On 09/14/2016 02:01 PM, Kyle Huey wrote:

>> Is any of this useful to optimize away at compile-time?  We have config
>> options for when we're running as a guest, and this seems like a feature
>> that isn't available when running on bare metal.
>
> On the contrary, this is only available when we're on bare metal.
> Neither Xen nor KVM virtualize CPUID faulting (although KVM correctly
> suppresses MSR_PLATFORM_INFO's report of support for it).

KVM could easily support this.  If rr starts using it, I think KVM
*should* add support, possibly even for older CPUs that don't support
the feature in hardware.

It's too bad that x86 doesn't give us the instruction bytes on a
fault.  Otherwise we could lazily switch this feature.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 2/3] x86 Test and expose CPUID faulting capabilities in /proc/cpuinfo

2016-09-15 Thread Andy Lutomirski
On Thu, Sep 15, 2016 at 12:11 PM, Kyle Huey  wrote:
> On Thu, Sep 15, 2016 at 3:25 AM, Jan Beulich  wrote:
> On 15.09.16 at 12:05,  wrote:
>>> On 14/09/16 22:01, Kyle Huey wrote:
 Xen advertises the underlying support for CPUID faulting but does not pass
 through writes to the relevant MSR, nor does it virtualize it, so it does
 not actually work. For now mask off the relevant bit on MSR_PLATFORM_INFO.
>>>
>>> Could you clarify in the commit message that it is PV guests that are
>>> affected.
>>
>> What makes you think HVM ones aren't?
>
> Testing on EC2, HVM guests are affected as well.  Not sure what to do
> about that.
>

It's kind of nasty, but it shouldn't be *too* hard to probe for this
thing during early boot.  Allocate a page somewhere that has the user
bit set, put something like this in it:

cpuid
inc %eax  /* return 1 */
movw %ax, %ss /* force %GP to get out of here */

Call it like this from asm (real asm, not inline):

FRAME_BEGIN
pushq %rbx

xorl %eax, %eax

/* Push return frame */
pushq %ss
pushq %rsp
addq $8, (%rsp)
pushfq
pushq %cs
pushq $end_of_cpuid_faulting_test

/* Call it! */
pushq $__USER_DS
pushq $0
pushq $X86_EFLAGS_FIXED  /* leave IF off when running the CPL3 stub */
pushq $__USER_CS
pushq [address of userspace stub]
INTERRUPT_RETURN

end_of_cpuid_faulting_test:
pop %rbx

FRAME_END

Run this after the main GDT is loaded but while the #GP vector is
temporarily pointing to:

movq SS-RIP(%rsp), %rsp  /* pop the real return frame */
INTERRUPT_RETURN

and with interrupts off.  The function should return 0 if CPUID
faulting works and 1 if it doesn't.

Yeah, this is gross, but it should work.  I'm not sure how okay I am
with putting this crap in the kernel...

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 2/3] x86 Test and expose CPUID faulting capabilities in /proc/cpuinfo

2016-09-15 Thread Andy Lutomirski
On Thu, Sep 15, 2016 at 1:38 PM, H. Peter Anvin  wrote:
> On September 14, 2016 6:17:51 PM PDT, Andy Lutomirski  
> wrote:
>>On Wed, Sep 14, 2016 at 3:03 PM, Kyle Huey  wrote:
>>> On Wed, Sep 14, 2016 at 2:35 PM, Dave Hansen
>>>  wrote:
>>>> On 09/14/2016 02:01 PM, Kyle Huey wrote:
>>
>>>> Is any of this useful to optimize away at compile-time?  We have
>>config
>>>> options for when we're running as a guest, and this seems like a
>>feature
>>>> that isn't available when running on bare metal.
>>>
>>> On the contrary, this is only available when we're on bare metal.
>>> Neither Xen nor KVM virtualize CPUID faulting (although KVM correctly
>>> suppresses MSR_PLATFORM_INFO's report of support for it).
>>
>>KVM could easily support this.  If rr starts using it, I think KVM
>>*should* add support, possibly even for older CPUs that don't support
>>the feature in hardware.
>>
>>It's too bad that x86 doesn't give us the instruction bytes on a
>>fault.  Otherwise we could lazily switch this feature.
>>
>>--Andy
>
> You can "always" examine the instruction bytes in memory... have to make sure 
> you properly consider the impact of race conditions though.

I'd rather avoid needing to worry about those race conditions if at
all possible, though.  Intel and AMD both have fancy "decode assists"
and such -- it would be quite nice IMO if we could get the same data
exposed in the handlers of synchronous faults.

If Intel or AMD were to do this for real, presumably the rule would be
that any fault-class exception caused by a validly-decoded instruction
at CPL3 (so #PF and #GP would count but #DB probably wouldn't, and #DF
wouldn't either unless the initial fault did) would stash away the
faulting instruction and other entries would instead stash away
"nothing here".  Some pair of MSRs or new instruction would read out
information.  Then we could accurately emulate CPUID, we could
accurately emulate page-faulting instructions if we cared, etc.  All
of the relevant hardware must already mostly exist because VMX and SVM
both have this capability.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 2/3] x86 Test and expose CPUID faulting capabilities in /proc/cpuinfo

2016-09-15 Thread Andy Lutomirski
On Thu, Sep 15, 2016 at 4:33 PM, Kyle Huey  wrote:

Reviewed-by: Andy Lutomirski 

although this is really Borislav's domain.

OTOH, if you're planning on changing Linux's Xen MSR helpers to mask
the feature out, that should be in the same patch or an earlier patch.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 1/3] syscalls, x86 Expose arch_prctl on x86-32.

2016-09-15 Thread Andy Lutomirski
On Thu, Sep 15, 2016 at 4:33 PM, Kyle Huey  wrote:
> arch_prctl is currently 64-bit only. Wire it up for 32-bits, as a no-op for
> now. Rename the second arg to a more generic name.
>
> Signed-off-by: Kyle Huey 
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |  1 +
>  arch/x86/include/asm/proto.h   |  5 -
>  arch/x86/kernel/process.c  | 10 ++
>  arch/x86/kernel/process_64.c   | 33 +
>  arch/x86/kernel/ptrace.c   |  8 
>  arch/x86/um/Makefile   |  2 +-
>  arch/x86/um/syscalls_32.c  |  7 +++
>  arch/x86/um/syscalls_64.c  |  4 ++--
>  include/linux/compat.h |  2 ++
>  9 files changed, 52 insertions(+), 20 deletions(-)
>  create mode 100644 arch/x86/um/syscalls_32.c
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
> b/arch/x86/entry/syscalls/syscall_32.tbl
> index f848572..666fa61 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -386,3 +386,4 @@
>  377i386copy_file_range sys_copy_file_range
>  378i386preadv2 sys_preadv2 
> compat_sys_preadv2
>  379i386pwritev2sys_pwritev2
> compat_sys_pwritev2
> +380i386arch_prctl  compat_sys_arch_prctl   
> compat_sys_arch_prctl

Let's call this sys_arch_prctl_32, even if it's unconventional.  See below.

> diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
> index 9b9b30b..f0e86aa 100644
> --- a/arch/x86/include/asm/proto.h
> +++ b/arch/x86/include/asm/proto.h
> @@ -30,6 +30,9 @@ void x86_report_nx(void);
>
>  extern int reboot_force;
>
> -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
> +long do_arch_prctl_common(struct task_struct *task, int code, unsigned long 
> addr);
> +#ifdef CONFIG_X86_64
> +long do_arch_prctl_64(struct task_struct *task, int code, unsigned long 
> addr);
> +#endif
>
>  #endif /* _ASM_X86_PROTO_H */
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 62c0b0e..1421451 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -567,3 +567,13 @@ unsigned long get_wchan(struct task_struct *p)
> } while (count++ < 16 && p->state != TASK_RUNNING);
> return 0;
>  }
> +
> +long do_arch_prctl_common(struct task_struct *task, int code, unsigned long 
> arg2)
> +{
> +   return -EINVAL;
> +}
> +
> +asmlinkage long compat_sys_arch_prctl(int code, unsigned long arg2)

I believe you mean COMPAT_SYSCALL_DEFINE2 here.

But I see what you're doing here.  Could you instead do:

#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_32)
#ifdef CONFIG_X86_32
COMPAT_SYSCALL_DEFINE2(...)
#else
SYSCALL_DEFINE2(...)
#endif

... body here ...
#endif

and name the thing do_arch_prctl_32?

It's too bad we don't have a SYSCALL_DEFINE_32 macro.  But you could add one...


> diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
> new file mode 100644
> index 000..c6812c1
> --- /dev/null
> +++ b/arch/x86/um/syscalls_32.c
> @@ -0,0 +1,7 @@
> +#include 
> +#include 
> +
> +long compat_sys_arch_prctl(int code, unsigned long arg2)

COMPAT_SYSCALL_DEFINE2

Also, does this really need a new file?

> -long sys_arch_prctl(int code, unsigned long addr)
> +long sys_arch_prctl(int code, unsigned long arg2)

SYSCALL_DEFINE2

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] x86, arch_prctl Add ARCH_[GET|SET]_CPUID for controlling the CPUID instruction

2016-09-15 Thread Andy Lutomirski
On Thu, Sep 15, 2016 at 4:33 PM, Kyle Huey  wrote:
> Intel supports faulting on the CPUID instruction in newer processors. Bit
> 31 of MSR_PLATFORM_INFO advertises support for this feature. It is
> documented in detail in Section 2.3.2 of
> http://www.intel.com/content/dam/www/public/us/en/documents/application-notes/virtualization-technology-flexmigration-application-note.pdf
>
> Support for this is implemented as a new pair of arch_prctls, available on 
> both x86-32 and x86-64.  The structure mirrors PR_[GET|SET]_TSC.  Like the 
> TSC flag, CPUID faulting is propagated across forks.  Unlike the TSC flag, it 
> is reset (to CPUID enabled) on exec.
>
> Signed-off-by: Kyle Huey 
> ---
>  arch/x86/include/asm/msr-index.h  |   1 +
>  arch/x86/include/asm/thread_info.h|   5 +-
>  arch/x86/include/uapi/asm/prctl.h |   6 +
>  arch/x86/kernel/process.c |  98 -
>  fs/exec.c |   6 +
>  tools/testing/selftests/x86/Makefile  |   2 +-
>  tools/testing/selftests/x86/cpuid-fault.c | 234 
> ++
>  7 files changed, 349 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/x86/cpuid-fault.c
>
> diff --git a/arch/x86/include/asm/msr-index.h 
> b/arch/x86/include/asm/msr-index.h
> index 83908d5..4aebec2 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -53,6 +53,7 @@
>  #define MSR_MTRRcap0x00fe
>  #define MSR_IA32_BBL_CR_CTL0x0119
>  #define MSR_IA32_BBL_CR_CTL3   0x011e
> +#define MSR_MISC_FEATURES_ENABLES  0x0140
>
>  #define MSR_IA32_SYSENTER_CS   0x0174
>  #define MSR_IA32_SYSENTER_ESP  0x0175
> diff --git a/arch/x86/include/asm/thread_info.h 
> b/arch/x86/include/asm/thread_info.h
> index 8b7c8d8..e3c40c6 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -93,6 +93,7 @@ struct thread_info {
>  #define TIF_SECCOMP8   /* secure computing */
>  #define TIF_USER_RETURN_NOTIFY 11  /* notify kernel of userspace return 
> */
>  #define TIF_UPROBE 12  /* breakpointed or singlestepping */
> +#define TIF_NOCPUID15  /* CPUID is not accessible in 
> userland */
>  #define TIF_NOTSC  16  /* TSC is not accessible in userland 
> */
>  #define TIF_IA32   17  /* IA32 compatibility process */
>  #define TIF_FORK   18  /* ret_from_fork */
> @@ -117,6 +118,7 @@ struct thread_info {
>  #define _TIF_SECCOMP   (1 << TIF_SECCOMP)
>  #define _TIF_USER_RETURN_NOTIFY(1 << TIF_USER_RETURN_NOTIFY)
>  #define _TIF_UPROBE(1 << TIF_UPROBE)
> +#define _TIF_NOCPUID   (1 << TIF_NOCPUID)
>  #define _TIF_NOTSC (1 << TIF_NOTSC)
>  #define _TIF_IA32  (1 << TIF_IA32)
>  #define _TIF_FORK  (1 << TIF_FORK)
> @@ -146,7 +148,7 @@ struct thread_info {
>
>  /* flags to check in __switch_to() */
>  #define _TIF_WORK_CTXSW  
>   \
> -   (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
> +   (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
>
>  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
>  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
> @@ -293,6 +295,7 @@ static inline bool in_ia32_syscall(void)
>  extern void arch_task_cache_init(void);
>  extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct 
> *src);
>  extern void arch_release_task_struct(struct task_struct *tsk);
> +extern void arch_post_exec(void);
>  #endif /* !__ASSEMBLY__ */
>
>  #endif /* _ASM_X86_THREAD_INFO_H */
> diff --git a/arch/x86/include/uapi/asm/prctl.h 
> b/arch/x86/include/uapi/asm/prctl.h
> index 3ac5032..c087e55 100644
> --- a/arch/x86/include/uapi/asm/prctl.h
> +++ b/arch/x86/include/uapi/asm/prctl.h
> @@ -6,4 +6,10 @@
>  #define ARCH_GET_FS 0x1003
>  #define ARCH_GET_GS 0x1004
>
> +/* Get/set the process' ability to use the CPUID instruction */
> +#define ARCH_GET_CPUID 0x1005
> +#define ARCH_SET_CPUID 0x1006
> +# define ARCH_CPUID_ENABLE 1   /* allow the use of the CPUID 
> instruction */
> +# define ARCH_CPUID_SIGSEGV2   /* throw a SIGSEGV instead of 
> reading the CPUID */
> +
>  #endif /* _ASM_X86_PRCTL_H */
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 1421451..f307d5c 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -32,6 +32,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  /*
>   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
> @@ -191,6 +192,75 @@ int set_tsc_mode(unsigned int val)
> return 0;
>  }
>
> +static void switch_cpuid_faulting(bool on)
> +{
> +   if (on)
> +   msr_set_bit(MSR_MISC_FEATURES_ENABLES, 0);
> +   else
> +   msr_clea

Re: [Xen-devel] [PATCH v3 4/4] KVM: VMX: Simplify segment_base

2017-02-14 Thread Andy Lutomirski
On Tue, Feb 14, 2017 at 11:42 AM, Thomas Garnier  wrote:
> The KVM segment_base function is confusing. This patch replaces integers
> with appropriate flags, simplify constructs and add comments.

It could pay to put this first in the series, but last is probably fine, too.

>
> Signed-off-by: Thomas Garnier 
> ---
> Based on next-20170213
> ---
>  arch/x86/kvm/vmx.c | 26 ++
>  1 file changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 99167f20bc34..edb8326108dd 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2062,25 +2062,35 @@ static unsigned long segment_base(u16 selector)
> struct desc_struct *d;
> unsigned long table_base;

This should go away IMO.  It should be struct desc_struct *table;

> +   table_base = get_current_gdt_rw_vaddr();

Then this can go away, too, and you can stop having the silly
get_current_gdt_rw_vaddr() function at all.

> +   d = (struct desc_struct *)table_base + (selector >> 3);

And this cast goes away too.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 4/4] KVM: VMX: Simplify segment_base

2017-02-20 Thread Andy Lutomirski
On Fri, Feb 17, 2017 at 2:01 PM, Thomas Garnier  wrote:
> On Fri, Feb 17, 2017 at 1:00 PM, Jim Mattson  wrote:
>> On Fri, Feb 17, 2017 at 12:11 PM, Thomas Garnier  wrote:
>>> On Fri, Feb 17, 2017 at 9:49 AM, Jim Mattson  wrote:

 Can we use the read-only GDT here? When expanding the virtual address
 for 64-bit system descriptors, isn't it sufficient to check (d->s == 0
 && d->type != 0)?
>>>
>>> We can use the readonly GDT but I think doesn't matter one or the
>>> other here. We have to check specific types for LDT or TSS, other
>>> values describe other entries (cf Intel volume 3, 3.5) (for example 14
>>> & 15 on 64-bits are for trap & interrupt gates).
>>
>> According to volume 3 of the SDM, section 3.5.2:
>>
>> The following system descriptors expand to 16 bytes:
>> — Call gate descriptors (see Section 5.8.3.1, “IA-32e Mode Call Gates”)
>> — IDT gate descriptors (see Section 6.14.1, “64-Bit Mode IDT”)
>> — LDT and TSS descriptors (see Section 7.2.3, “TSS Descriptor in 64-bit 
>> mode”).
>>
>> All legal system descriptor types (except for 0: Upper 8 bytes of an 16-byte
>> descriptor) should get the high 32 bits of the base address from the next 
>> 8-byte
>> descriptor.
>>
>
> Ok, then I will test an updated version next week.
>

I'm going to send out some preliminary patches that just get rid of
this problem entirely.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 5/6] x86/xen: Add a Xen-specific sync_core() implementation

2016-12-02 Thread Andy Lutomirski
On Dec 2, 2016 3:44 AM, "Andrew Cooper"  wrote:
>
> On 02/12/16 00:35, Andy Lutomirski wrote:
> > On Xen PV, CPUID is likely to trap, and Xen hypercalls aren't
> > guaranteed to serialize.  (Even CPUID isn't *really* guaranteed to
> > serialize on Xen PV, but, in practice, any trap it generates will
> > serialize.)
>
> Well, Xen will enable CPUID Faulting wherever it can, which is
> realistically all IvyBridge hardware and newer.
>
> All hypercalls are a privilege change to cpl0.  I'd hope this condition
> is serialising, but I can't actually find any documentation proving or
> disproving this.

I don't know for sure.  IRET is serializing, and if Xen returns using
IRET, we're fine.

>
> >
> > On my laptop, CPUID(eax=1, ecx=0) is ~83ns and IRET-to-self is
> > ~110ns.  But Xen PV will trap CPUID if possible, so IRET-to-self
> > should end up being a nice speedup.
> >
> > Cc: Andrew Cooper 
> > Signed-off-by: Andy Lutomirski 
>
> CC'ing xen-devel and the Xen maintainers in Linux.
>
> As this is the only email from this series in my inbox, I will say this
> here, but it should really be against patch 6.
>
> A write to %cr2 is apparently (http://sandpile.org/x86/coherent.htm) not
> serialising on the 486, but I don't have a manual to hand to check.

I'll quote the (modern) SDM.  For self-modifying code "The use of one
of these options is not required for programs intended to run on the
Pentium or Intel486 processors,
but are recommended to ensure compatibility with the P6 and more
recent processor families.".  For cross-modifying code "The use of
this option is not required for programs intended to run on the
Intel486 processor, but is recommended
to ensure compatibility with the Pentium 4, Intel Xeon, P6 family, and
Pentium processors."  So I'm not sure there's a problem.

I can add an unconditional jump just to make sure.  It costs basically
nothing on modern CPUs.  (Also, CPUID also isn't serializing on 486
according to the table.)

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 5/6] x86/xen: Add a Xen-specific sync_core() implementation

2016-12-02 Thread Andy Lutomirski
On Fri, Dec 2, 2016 at 9:16 AM, Andrew Cooper  wrote:
> On 02/12/16 17:07, Andy Lutomirski wrote:
>> On Dec 2, 2016 3:44 AM, "Andrew Cooper"  wrote:
>>> On 02/12/16 00:35, Andy Lutomirski wrote:
>>>> On Xen PV, CPUID is likely to trap, and Xen hypercalls aren't
>>>> guaranteed to serialize.  (Even CPUID isn't *really* guaranteed to
>>>> serialize on Xen PV, but, in practice, any trap it generates will
>>>> serialize.)
>>> Well, Xen will enable CPUID Faulting wherever it can, which is
>>> realistically all IvyBridge hardware and newer.
>>>
>>> All hypercalls are a privilege change to cpl0.  I'd hope this condition
>>> is serialising, but I can't actually find any documentation proving or
>>> disproving this.
>> I don't know for sure.  IRET is serializing, and if Xen returns using
>> IRET, we're fine.
>
> All returns to a 64bit PV guest at defined points (hypercall return,
> exception entry, etc) are from SYSRET, not IRET.

But CPUID faulting isn't like this, right?  Unless Xen does
opportunistic SYSRET.

>
> Talking of, I still have a patch to remove
> PARAVIRT_ADJUST_EXCEPTION_FRAME which I need to complete and send upstream.
>
>>
>>>> On my laptop, CPUID(eax=1, ecx=0) is ~83ns and IRET-to-self is
>>>> ~110ns.  But Xen PV will trap CPUID if possible, so IRET-to-self
>>>> should end up being a nice speedup.
>>>>
>>>> Cc: Andrew Cooper 
>>>> Signed-off-by: Andy Lutomirski 
>>> CC'ing xen-devel and the Xen maintainers in Linux.
>>>
>>> As this is the only email from this series in my inbox, I will say this
>>> here, but it should really be against patch 6.
>>>
>>> A write to %cr2 is apparently (http://sandpile.org/x86/coherent.htm) not
>>> serialising on the 486, but I don't have a manual to hand to check.
>> I'll quote the (modern) SDM.  For self-modifying code "The use of one
>> of these options is not required for programs intended to run on the
>> Pentium or Intel486 processors,
>> but are recommended to ensure compatibility with the P6 and more
>> recent processor families.".  For cross-modifying code "The use of
>> this option is not required for programs intended to run on the
>> Intel486 processor, but is recommended
>> to ensure compatibility with the Pentium 4, Intel Xeon, P6 family, and
>> Pentium processors."  So I'm not sure there's a problem.
>
> Fair enough.  (Assuming similar properties hold for the older processors
> of other vendors.)

No, you were right -- a different section of the SDM contradicts it:

For Intel486 processors, a write to an instruction in the cache will
modify it in both the cache and memory, but if
the instruction was prefetched before the write, the old version of
the instruction could be the one executed. To
prevent the old instruction from being executed, flush the instruction
prefetch unit by coding a jump instruction
immediately after any write that modifies an instruction.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 5/6] x86/xen: Add a Xen-specific sync_core() implementation

2016-12-02 Thread Andy Lutomirski
On Dec 2, 2016 10:48 AM, "Boris Ostrovsky"  wrote:
>
> On 12/02/2016 06:44 AM, Andrew Cooper wrote:
> > On 02/12/16 00:35, Andy Lutomirski wrote:
> >> On Xen PV, CPUID is likely to trap, and Xen hypercalls aren't
> >> guaranteed to serialize.  (Even CPUID isn't *really* guaranteed to
> >> serialize on Xen PV, but, in practice, any trap it generates will
> >> serialize.)
> > Well, Xen will enable CPUID Faulting wherever it can, which is
> > realistically all IvyBridge hardware and newer.
> >
> > All hypercalls are a privilege change to cpl0.  I'd hope this condition
> > is serialising, but I can't actually find any documentation proving or
> > disproving this.
> >
> >> On my laptop, CPUID(eax=1, ecx=0) is ~83ns and IRET-to-self is
> >> ~110ns.  But Xen PV will trap CPUID if possible, so IRET-to-self
> >> should end up being a nice speedup.
> >>
> >> Cc: Andrew Cooper 
> >> Signed-off-by: Andy Lutomirski 
> > CC'ing xen-devel and the Xen maintainers in Linux.
> >
> > As this is the only email from this series in my inbox, I will say this
> > here, but it should really be against patch 6.
> >
> > A write to %cr2 is apparently (http://sandpile.org/x86/coherent.htm) not
> > serialising on the 486, but I don't have a manual to hand to check.
> >
> > ~Andrew
> >
> >> ---
> >>  arch/x86/xen/enlighten.c | 35 +++
> >>  1 file changed, 35 insertions(+)
> >>
> >> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> >> index bdd855685403..1f765b41eee7 100644
> >> --- a/arch/x86/xen/enlighten.c
> >> +++ b/arch/x86/xen/enlighten.c
> >> @@ -311,6 +311,39 @@ static __read_mostly unsigned int 
> >> cpuid_leaf1_ecx_set_mask;
> >>  static __read_mostly unsigned int cpuid_leaf5_ecx_val;
> >>  static __read_mostly unsigned int cpuid_leaf5_edx_val;
> >>
> >> +static void xen_sync_core(void)
> >> +{
> >> +register void *__sp asm(_ASM_SP);
> >> +
> >> +#ifdef CONFIG_X86_32
> >> +asm volatile (
> >> +"pushl %%ss\n\t"
> >> +"pushl %%esp\n\t"
> >> +"addl $4, (%%esp)\n\t"
> >> +"pushfl\n\t"
> >> +"pushl %%cs\n\t"
> >> +"pushl $1f\n\t"
> >> +"iret\n\t"
> >> +"1:"
> >> +: "+r" (__sp) : : "cc");
>
> This breaks 32-bit PV guests.
>
> Why are we pushing %ss? We are not changing privilege levels so why not
> just flags, cs and eip (which, incidentally, does work)?
>

Doh!  I carefully tested 64-bit on Xen and 32-bit in user mode.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v3 2/4] Revert "x86/boot: Fail the boot if !M486 and CPUID is missing"

2016-12-05 Thread Andy Lutomirski
This reverts commit ed68d7e9b9cfb64f3045ffbcb108df03c09a0f98.

The patch wasn't quite correct -- there are non-Intel (and hence
non-486) CPUs that we support that don't have CPUID.  Since we no
longer require CPUID for sync_core(), just revert the patch.

I think the relevant CPUs are Geode and Elan, but I'm not sure.

In principle, we should try to do better at identifying CPUID-less
CPUs in early boot, but that's more complicated.

Reported-by: One Thousand Gnomes 
Cc: Matthew Whitehead 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/boot/cpu.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 4224ede43b4e..26240dde081e 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -87,12 +87,6 @@ int validate_cpu(void)
return -1;
}
 
-   if (CONFIG_X86_MINIMUM_CPU_FAMILY <= 4 && !IS_ENABLED(CONFIG_M486) &&
-   !has_eflag(X86_EFLAGS_ID)) {
-   printf("This kernel requires a CPU with the CPUID instruction.  
Build with CONFIG_M486=y to run on this CPU.\n");
-   return -1;
-   }
-
if (err_flags) {
puts("This kernel requires the following features "
 "not present on the CPU:\n");
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v3 4/4] x86/asm: Rewrite sync_core() to use IRET-to-self

2016-12-05 Thread Andy Lutomirski
Aside from being excessively slow, CPUID is problematic: Linux runs
on a handful of CPUs that don't have CPUID.  Use IRET-to-self
instead.  IRET-to-self works everywhere, so it makes testing easy.

For reference, On my laptop, IRET-to-self is ~110ns,
CPUID(eax=1, ecx=0) is ~83ns on native and very very slow under KVM,
and MOV-to-CR2 is ~42ns.

While we're at it: sync_core() serves a very specific purpose.
Document it.

Cc: "H. Peter Anvin" 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/processor.h | 77 
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 64fbc937d586..201a956e345f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -590,33 +590,66 @@ static __always_inline void cpu_relax(void)
 
 #define cpu_relax_lowlatency() cpu_relax()
 
-/* Stop speculative execution and prefetching of modified code. */
+/*
+ * This function forces the icache and prefetched instruction stream to
+ * catch up with reality in two very specific cases:
+ *
+ *  a) Text was modified using one virtual address and is about to be executed
+ * from the same physical page at a different virtual address.
+ *
+ *  b) Text was modified on a different CPU, may subsequently be
+ * executed on this CPU, and you want to make sure the new version
+ * gets executed.  This generally means you're calling this in a IPI.
+ *
+ * If you're calling this for a different reason, you're probably doing
+ * it wrong.
+ */
 static inline void sync_core(void)
 {
-   int tmp;
-
-#ifdef CONFIG_X86_32
/*
-* Do a CPUID if available, otherwise do a jump.  The jump
-* can conveniently enough be the jump around CPUID.
+* There are quite a few ways to do this.  IRET-to-self is nice
+* because it works on every CPU, at any CPL (so it's compatible
+* with paravirtualization), and it never exits to a hypervisor.
+* The only down sides are that it's a bit slow (it seems to be
+* a bit more than 2x slower than the fastest options) and that
+* it unmasks NMIs.  The "push %cs" is needed because, in
+* paravirtual environments, __KERNEL_CS may not be a valid CS
+* value when we do IRET directly.
+*
+* In case NMI unmasking or performance ever becomes a problem,
+* the next best option appears to be MOV-to-CR2 and an
+* unconditional jump.  That sequence also works on all CPUs,
+* but it will fault at CPL3.
+*
+* CPUID is the conventional way, but it's nasty: it doesn't
+* exist on some 486-like CPUs, and it usually exits to a
+* hypervisor.
 */
-   asm volatile("cmpl %2,%1\n\t"
-"jl 1f\n\t"
-"cpuid\n"
-"1:"
-: "=a" (tmp)
-: "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1)
-: "ebx", "ecx", "edx", "memory");
+   register void *__sp asm(_ASM_SP);
+
+#ifdef CONFIG_X86_32
+   asm volatile (
+   "pushfl\n\t"
+   "pushl %%cs\n\t"
+   "pushl $1f\n\t"
+   "iret\n\t"
+   "1:"
+   : "+r" (__sp) : : "cc", "memory");
 #else
-   /*
-* CPUID is a barrier to speculative execution.
-* Prefetched instructions are automatically
-* invalidated when modified.
-*/
-   asm volatile("cpuid"
-: "=a" (tmp)
-: "0" (1)
-: "ebx", "ecx", "edx", "memory");
+   unsigned long tmp;
+
+   asm volatile (
+   "movq %%ss, %0\n\t"
+   "pushq %0\n\t"
+   "pushq %%rsp\n\t"
+   "addq $8, (%%rsp)\n\t"
+   "pushfq\n\t"
+   "movq %%cs, %0\n\t"
+   "pushq %0\n\t"
+   "pushq $1f\n\t"
+   "iretq\n\t"
+   "1:"
+   : "=r" (tmp), "+r" (__sp) : : "cc", "memory");
 #endif
 }
 
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v3 3/4] x86/microcode/intel: Replace sync_core() with native_cpuid()

2016-12-05 Thread Andy Lutomirski
The Intel microcode driver is using sync_core() to mean "do CPUID
with EAX=1".  I want to rework sync_core(), but first the Intel
microcode driver needs to stop depending on its current behavior.

Reported-by: Henrique de Moraes Holschuh 
Acked-by: Borislav Petkov 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/kernel/cpu/microcode/intel.c | 26 +++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c 
b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..e0981bb2a351 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -356,6 +356,26 @@ get_matching_model_microcode(unsigned long start, void 
*data, size_t size,
return state;
 }
 
+static void cpuid_1(void)
+{
+   /*
+* According to the Intel SDM, Volume 3, 9.11.7:
+*
+*   CPUID returns a value in a model specific register in
+*   addition to its usual register return values. The
+*   semantics of CPUID cause it to deposit an update ID value
+*   in the 64-bit model-specific register at address 08BH
+*   (IA32_BIOS_SIGN_ID). If no update is present in the
+*   processor, the value in the MSR remains unmodified.
+*
+* Use native_cpuid -- this code runs very early and we don't
+* want to mess with paravirt.
+*/
+   unsigned int eax = 1, ebx, ecx = 0, edx;
+
+   native_cpuid(&eax, &ebx, &ecx, &edx);
+}
+
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
unsigned int val[2];
@@ -385,7 +405,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info 
*uci)
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
/* As documented in the SDM: Do a CPUID 1 here */
-   sync_core();
+   cpuid_1();
 
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -627,7 +647,7 @@ static int apply_microcode_early(struct ucode_cpu_info 
*uci, bool early)
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
/* As documented in the SDM: Do a CPUID 1 here */
-   sync_core();
+   cpuid_1();
 
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -927,7 +947,7 @@ static int apply_microcode_intel(int cpu)
wrmsrl(MSR_IA32_UCODE_REV, 0);
 
/* As documented in the SDM: Do a CPUID 1 here */
-   sync_core();
+   cpuid_1();
 
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v3 0/4] CPUID-less CPU/sync_core fixes and improvements

2016-12-05 Thread Andy Lutomirski
*** PATCHES 1 and 2 MAY BE 4.9 MATERIAL ***

Alan Cox pointed out that the 486 isn't the only supported CPU that
doesn't have CPUID.  Let's clean up the mess and make everything
faster while we're at it.

Patch 1 is intended to be an easy fix: it makes sync_core() work
without CPUID on all 32-bit kernels.  It should be quite safe.  This
will have a negligible performance cost during boot on kernels built
for newer CPUs.  With this in place, patch 2 reverts the buggy 486
check I added.

Patches 3-4 are meant to improve the situation.  Patch 3 cleans up
the Intel microcode loader and patch 4 (which depends on patch 3
to work correctly) stops using CPUID in sync_core() altogether.

Changes from v2:
 - Switch to IRET-to-self and get rid of all the paravirt code.
 - Further improve the sync_core() comment.

Changes from v1:
 - Fix Xen
 - Add timing info to the changelog (hint: 2x speedup)
 - Document patch 1 a bit better.

Andy Lutomirski (4):
  x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit
kernels
  Revert "x86/boot: Fail the boot if !M486 and CPUID is missing"
  x86/microcode/intel: Replace sync_core() with native_cpuid()
  x86/asm: Rewrite sync_core() to use IRET-to-self

 arch/x86/boot/cpu.c   |  6 ---
 arch/x86/include/asm/processor.h  | 77 +--
 arch/x86/kernel/cpu/microcode/intel.c | 26 ++--
 3 files changed, 78 insertions(+), 31 deletions(-)

-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v3 1/4] x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels

2016-12-05 Thread Andy Lutomirski
We support various non-Intel CPUs that don't have the CPUID
instruction, so the M486 test was wrong.  For now, fix it with a big
hammer: handle missing CPUID on all 32-bit CPUs.

Reported-by: One Thousand Gnomes 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/processor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 984a7bf17f6a..64fbc937d586 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -595,7 +595,7 @@ static inline void sync_core(void)
 {
int tmp;
 
-#ifdef CONFIG_M486
+#ifdef CONFIG_X86_32
/*
 * Do a CPUID if available, otherwise do a jump.  The jump
 * can conveniently enough be the jump around CPUID.
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 4/4] x86/asm: Rewrite sync_core() to use IRET-to-self

2016-12-06 Thread Andy Lutomirski
On Tue, Dec 6, 2016 at 1:49 AM, Jan Beulich  wrote:
 On 06.12.16 at 10:25,  wrote:
>> On Tue, Dec 06, 2016 at 01:46:37AM -0700, Jan Beulich wrote:
>>> > +  asm volatile (
>>> > +  "pushfl\n\t"
>>> > +  "pushl %%cs\n\t"
>>> > +  "pushl $1f\n\t"
>>> > +  "iret\n\t"
>>> > +  "1:"
>>> > +  : "+r" (__sp) : : "cc", "memory");
>>>
>>> I don't thing EFLAGS (i.e. "cc") gets modified anywhere here. And
>>> the memory clobber would perhaps better be pulled out into an
>>> explicit barrier() invocation (making it more obvious what it's needed
>>> for)?
>>
>> EVerything that implies a memory barrier (and I think serializing
>> instructions do that) also imply a compiler barrier.
>>
>> Not doing the memory clobber gets you inconsistency wrt everything else.
>
> Well, I didn't say dropping the memory clobber altogether, but
> split it into a separate barrier() invocation (placed perhaps after
> the #endif).

I'll add a comment.  I'm fixing up the constraints, too.  (Although if
gcc allocated tmp into rsp, that would be very strange indeed.)

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 4/4] x86/asm: Rewrite sync_core() to use IRET-to-self

2016-12-06 Thread Andy Lutomirski
On Mon, Dec 5, 2016 at 11:52 PM, Borislav Petkov  wrote:
> On Mon, Dec 05, 2016 at 01:32:43PM -0800, Andy Lutomirski wrote:
>> Aside from being excessively slow, CPUID is problematic: Linux runs
>> on a handful of CPUs that don't have CPUID.  Use IRET-to-self
>> instead.  IRET-to-self works everywhere, so it makes testing easy.
>>
>> For reference, On my laptop, IRET-to-self is ~110ns,
>> CPUID(eax=1, ecx=0) is ~83ns on native and very very slow under KVM,
>> and MOV-to-CR2 is ~42ns.
>>
>> While we're at it: sync_core() serves a very specific purpose.
>> Document it.
>>
>> Cc: "H. Peter Anvin" 
>> Signed-off-by: Andy Lutomirski 
>> ---
>>  arch/x86/include/asm/processor.h | 77 
>> 
>>  1 file changed, 55 insertions(+), 22 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/processor.h 
>> b/arch/x86/include/asm/processor.h
>> index 64fbc937d586..201a956e345f 100644
>> --- a/arch/x86/include/asm/processor.h
>> +++ b/arch/x86/include/asm/processor.h
>> @@ -590,33 +590,66 @@ static __always_inline void cpu_relax(void)
>>
>>  #define cpu_relax_lowlatency() cpu_relax()
>>
>> -/* Stop speculative execution and prefetching of modified code. */
>> +/*
>> + * This function forces the icache and prefetched instruction stream to
>> + * catch up with reality in two very specific cases:
>> + *
>> + *  a) Text was modified using one virtual address and is about to be 
>> executed
>> + * from the same physical page at a different virtual address.
>> + *
>> + *  b) Text was modified on a different CPU, may subsequently be
>> + * executed on this CPU, and you want to make sure the new version
>> + * gets executed.  This generally means you're calling this in an IPI.
>> + *
>> + * If you're calling this for a different reason, you're probably doing
>> + * it wrong.
>
> "... and think hard before you call this - it is slow."
>
> I'd add that now that it is even slower than CPUID.

But only barely.  And it's hugely faster than CPUID under KVM or
similar.  And it works on all CPUs.

>
>> + */
>>  static inline void sync_core(void)
>>  {
>> - int tmp;
>> -
>> -#ifdef CONFIG_X86_32
>>   /*
>> -  * Do a CPUID if available, otherwise do a jump.  The jump
>> -  * can conveniently enough be the jump around CPUID.
>> +  * There are quite a few ways to do this.  IRET-to-self is nice
>> +  * because it works on every CPU, at any CPL (so it's compatible
>> +  * with paravirtualization), and it never exits to a hypervisor.
>> +  * The only down sides are that it's a bit slow (it seems to be
>> +  * a bit more than 2x slower than the fastest options) and that
>> +  * it unmasks NMIs.
>
> Ewww, I hadn't thought of that angle. Aren't we going to get in all
> kinds of hard to debug issues due to that couple of cycles window of
> unmasked NMIs?
>
> We sync_core in some NMI handler and then right in the middle of it we
> get another NMI. Yeah, we have the nested NMI stuff still but I'd like
> to avoid complications if possible.

I'll counter with:

1. Why on earth would an NMI call sync_core()?

2. We have very careful code to handle this issue, and NMIs really
do cause page faults which have exactly the same problem.

So I'm not too worried.

>
>> The "push %cs" is needed because, in
>> +  * paravirtual environments, __KERNEL_CS may not be a valid CS
>> +  * value when we do IRET directly.
>> +  *
>> +  * In case NMI unmasking or performance ever becomes a problem,
>> +  * the next best option appears to be MOV-to-CR2 and an
>> +  * unconditional jump.  That sequence also works on all CPUs,
>> +  * but it will fault at CPL3.
>
> Does it really have to be non-priviledged?

Unless we want to declare lguest unsupported, delete it from the tree,
or, sigh, actually maintain it, then yes :(  native_write_cr2() would
work on Xen, but it's slow.

>
> If not, there are a couple more serializing insns:
>
> "• Privileged serializing instructions — INVD, INVEPT, INVLPG,
> INVVPID, LGDT, LIDT, LLDT, LTR, MOV (to control register, with the
> exception of MOV CR83), MOV (to debug register), WBINVD, and WRMSR"
>
> What about INVD? It is expensive too :-)

Only if you write the patch and label it:

Snickered-at-by: Andy Lutomirski 

>
> Can't we use, MOV %dr or so? With previously saving/restoring the reg?
>
> WBINVD could be another candidate, albeit a big hammer.
>
> WRMSR maybe too. If it faults, it's fine too because then you have the
> IRET automagically. Hell, we could even make it fault on purpose by
> writing into an invalid MSR but then we're back to the unmasking NMIs...
> :-\

I still like MOV-to-CR2 better than all of these.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [RFC v3 00/13] linux: generalize sections, ranges and linker tables

2016-08-09 Thread Andy Lutomirski
On Aug 9, 2016 7:09 PM, "James Bottomley" <
james.bottom...@hansenpartnership.com> wrote:
>
> On Tue, 2016-08-09 at 15:24 +0100, One Thousand Gnomes wrote:
> > > table development go under copyleft-next, Rusty recently asked for
> > > code to go in prior to the license tag being added denoting this
> > > license as GPL-compatible [3] -- I had noted in the patch
> > > submission which annotated copyleft-next's compatibility to GPLv2
> > > that copyleft-next is the license of choice for ongoing kernel
> > > development on my end [4]. If this is objectionable I'm happy to
> > > change it to GPLv2 however I'd like a reason provided as I've gone
> > > through all possible channels to ensure this is kosher, including
> > > vetting by 3 attorneys now, 2 at SUSE.
> >
> > You don't need a new tag, you can use "GPL" or "GPL and additional
> > rights". In fact you don't want any other tag because when combined
> >  with the kernel it is GPLv2 anyway because the only way the two are
> > fully compatible is for the kernel community to license the derived
> > work under the GPL.
>
> This is the module tag ... it says what licence the module is under,
> not the licence for the module combined with the kernel, which is
> always GPLv2 because the stricter licence rules.

Then why isn't "BSD" in the license_is_gpl_compatible list?
___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 2/3] x86/xen/time: setup vcpu 0 time info page

2017-01-26 Thread Andy Lutomirski
On Wed, Jan 25, 2017 at 9:33 AM, Joao Martins  wrote:
> In order to support pvclock vdso on xen we need to setup the time
> info page for vcpu 0 and register the page with Xen using the
> VCPUOP_register_vcpu_time_memory_area hypercall. This hypercall
> will also forcefully update the pvti which will set some of the
> necessary flags for vdso. Afterwards we check if it supports the
> PVCLOCK_TSC_STABLE_BIT flag which is mandatory for having
> vdso/vsyscall support. And if so, it will set the cpu 0 pvti that
> will be later on used when mapping the vdso image.
>
> The xen headers are also updated to include the new hypercall for
> registering the secondary vcpu_time_info struct.

No objections from me assuming the code is correct.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 1/3] x86/pvclock: add setter for pvclock_pvti_cpu0_va

2017-01-26 Thread Andy Lutomirski
On Wed, Jan 25, 2017 at 9:33 AM, Joao Martins  wrote:
> Right now there is only a pvclock_pvti_cpu0_va() which is defined
> on kvmclock since:
>
> commit dac16fba6fc5
> ("x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap")
>
> The only user of this interface was kvm. This commit moves
> pvclock_pvti_cpu0_va to pvclock which is a more generic place to have it
> and adds the correspondent setter routine for it. This allows other
> pvclock-based clocksources to use it, such as Xen.

With a minor nit:

Acked-by: Andy Lutomirski 

> +#else
> +static inline void pvclock_set_pvti_cpu0_va(struct 
> pvclock_vsyscall_time_info *pvti)
> +{
> +}

How about just not providing pvclock_set_pvti_cpu0_va() in this case?
It'll save three lines of code, and, more importantly, it will force
us to notice if we screw up the Kconfig stuff.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 3/3] MAINTAINERS: xen, kvm: track pvclock-abi.h changes

2017-01-26 Thread Andy Lutomirski
On Wed, Jan 25, 2017 at 9:33 AM, Joao Martins  wrote:
> This file defines an ABI shared between guest and hypervisor(s)
> (KVM, Xen) and as such there should be an correspondent entry in
> MAINTAINERS file. Notice that there's already a text notice at the
> top of the header file, hence this commit simply enforces it more
> explicitly and have both peers noticed when such changes happen.
>
> Signed-off-by: Joao Martins 
> ---
> This was suggested by folks at xen-devel as we missed some of the
> ABI additions (e.g. flags field in pvti, TSC stable bit) - so this
> patch is to help preventing that from happening. Alternatively I
> could instead add a "PVCLOCK ABI" section in this file with the
> two mailing lists.

If you do the latter, please add me as an R:.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 2/3] x86: Remap GDT tables in the Fixmap section

2017-01-26 Thread Andy Lutomirski
On Thu, Jan 26, 2017 at 8:59 AM, Thomas Garnier  wrote:
> Each processor holds a GDT in its per-cpu structure. The sgdt
> instruction gives the base address of the current GDT. This address can
> be used to bypass KASLR memory randomization. With another bug, an
> attacker could target other per-cpu structures or deduce the base of
> the main memory section (PAGE_OFFSET).
>
> This patch relocates the GDT table for each processor inside the
> Fixmap section. The space is reserved based on number of supported
> processors.
>
> For consistency, the remapping is done by default on 32 and 64 bit.
>
> Each processor switches to its remapped GDT at the end of
> initialization. For hibernation, the main processor returns with the
> original GDT and switches back to the remapping at completion.
>
> This patch was tested on both architectures. Hibernation and KVM were
> both tested specially for their usage of the GDT.

I like this version much better.  Thanks!

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 3/3] x86: Make the GDT remapping read-only on 64 bit

2017-02-01 Thread Andy Lutomirski
On Wed, Feb 1, 2017 at 1:15 AM, Ingo Molnar  wrote:
>
> * Thomas Garnier  wrote:
>
>> This patch makes the GDT remapped pages read-only to prevent corruption.
>> This change is done only on 64 bit.
>


>>
>> - table_base = gdt->address;
>> + table_base = (unsigned long)get_current_direct_gdt();
>
> Instead of spreading these type casts far and wide please introduce another
> accessor the returns 'unsigned long':
>
> get_cpu_gdt_rw_vaddr()
>

That whole function is an abomination.  How about replacing 'unsigned
long table_base' with 'struct desc_struct *table'?  If you're feeling
really adventurous, *delete* that function and replace all of its
users with something sane.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 3/3] x86: Make the GDT remapping read-only on 64 bit

2017-02-01 Thread Andy Lutomirski
On Thu, Jan 26, 2017 at 8:59 AM, Thomas Garnier  wrote:
> This patch makes the GDT remapped pages read-only to prevent corruption.
> This change is done only on 64 bit.
>
> The native_load_tr_desc function was adapted to correctly handle a
> read-only GDT. The LTR instruction always writes to the GDT TSS entry.
> This generates a page fault if the GDT is read-only. This change checks
> if the current GDT is a remap and swaps GDTs as needed. This function was
> tested by booting multiple machines and checking hibernation works
> properly.
>
> KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the
> per-cpu variable was removed for functions to fetch the original GDT.
> Instead of reloading the previous GDT, VMX will reload the fixmap GDT as
> expected. For testing, VMs were started and restored on multiple
> configurations.
>
> Signed-off-by: Thomas Garnier 
> ---
> Based on next-20170125
> ---
>  arch/x86/include/asm/desc.h  | 46 
> +++-
>  arch/x86/include/asm/processor.h |  1 +
>  arch/x86/kernel/cpu/common.c | 28 ++--
>  arch/x86/kvm/svm.c   |  4 +---
>  arch/x86/kvm/vmx.c   | 15 +
>  5 files changed, 70 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index 4cc176f57b78..ca7b2224fcb4 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -52,6 +52,12 @@ static inline struct desc_struct 
> *get_cpu_direct_gdt(unsigned int cpu)
> return per_cpu(gdt_page, cpu).gdt;
>  }
>
> +/* Provide the current original GDT */
> +static inline struct desc_struct *get_current_direct_gdt(void)
> +{
> +   return this_cpu_ptr(&gdt_page)->gdt;
> +}

I'm assuming that the reason that this isn't part of patch 2 and used
instead of the version that takes cpu as a parameter is that TLS
doesn't work until the GDT is set up.  If so, perhaps that's worthy of
a comment in patch 2.

But give this_cpu_read(gdt_page.gdt) a try, please.

> +/*
> + * The LTR instruction marks the TSS GDT entry as busy. In 64bit, the GDT is
> + * a read-only remapping. To prevent a page fault, the GDT is switched to the
> + * original writeable version when needed.
> + */
> +#ifdef CONFIG_X86_64
> +static inline void native_load_tr_desc(void)
> +{
> +   struct desc_ptr gdt;
> +   int cpu = raw_smp_processor_id();
> +   bool restore = false;
> +   struct desc_struct *fixmap_gdt;
> +
> +   native_store_gdt(&gdt);

Off the top of my head, this is something like 10 cycles.  IMO that's
fast enough not to worry about the regression this will cause to KVM
exits.  In any event, we'll get that back and *much* more when we do
the optimizations that this series enables.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 0/4] CPUID-less CPU/sync_core fixes and improvements

2016-12-09 Thread Andy Lutomirski
*** PATCHES 1 and 2 MAY BE 4.9 MATERIAL ***

Alan Cox pointed out that the 486 isn't the only supported CPU that
doesn't have CPUID.  Let's clean up the mess and make everything
faster while we're at it.

Patch 1 is intended to be an easy fix: it makes sync_core() work
without CPUID on all 32-bit kernels.  It should be quite safe.  This
will have a negligible performance cost during boot on kernels built
for newer CPUs.  With this in place, patch 2 reverts the buggy 486
check I added.

Patches 3-4 are meant to improve the situation.  Patch 3 cleans up
the Intel microcode loader and patch 4 (which depends on patch 3
to work correctly) stops using CPUID in sync_core() altogether.

Changes from v3:
 - Improve sync_core() comments.
 - Tidy up sync_core() asm.

Changes from v2:
 - Switch to IRET-to-self and get rid of all the paravirt code.
 - Further improve the sync_core() comment.

Changes from v1:
 - Fix Xen
 - Add timing info to the changelog (hint: 2x speedup)
 - Document patch 1 a bit better.

Andy Lutomirski (4):
  x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit
kernels
  Revert "x86/boot: Fail the boot if !M486 and CPUID is missing"
  x86/microcode/intel: Replace sync_core() with native_cpuid()
  x86/asm: Rewrite sync_core() to use IRET-to-self

 arch/x86/boot/cpu.c   |  6 ---
 arch/x86/include/asm/processor.h  | 80 +--
 arch/x86/kernel/cpu/microcode/intel.c | 26 ++--
 3 files changed, 81 insertions(+), 31 deletions(-)

-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 2/4] Revert "x86/boot: Fail the boot if !M486 and CPUID is missing"

2016-12-09 Thread Andy Lutomirski
This reverts commit ed68d7e9b9cfb64f3045ffbcb108df03c09a0f98.

The patch wasn't quite correct -- there are non-Intel (and hence
non-486) CPUs that we support that don't have CPUID.  Since we no
longer require CPUID for sync_core(), just revert the patch.

I think the relevant CPUs are Geode and Elan, but I'm not sure.

In principle, we should try to do better at identifying CPUID-less
CPUs in early boot, but that's more complicated.

Reported-by: One Thousand Gnomes 
Cc: Matthew Whitehead 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/boot/cpu.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 4224ede43b4e..26240dde081e 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -87,12 +87,6 @@ int validate_cpu(void)
return -1;
}
 
-   if (CONFIG_X86_MINIMUM_CPU_FAMILY <= 4 && !IS_ENABLED(CONFIG_M486) &&
-   !has_eflag(X86_EFLAGS_ID)) {
-   printf("This kernel requires a CPU with the CPUID instruction.  
Build with CONFIG_M486=y to run on this CPU.\n");
-   return -1;
-   }
-
if (err_flags) {
puts("This kernel requires the following features "
 "not present on the CPU:\n");
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 3/4] x86/microcode/intel: Replace sync_core() with native_cpuid()

2016-12-09 Thread Andy Lutomirski
The Intel microcode driver is using sync_core() to mean "do CPUID
with EAX=1".  I want to rework sync_core(), but first the Intel
microcode driver needs to stop depending on its current behavior.

Reported-by: Henrique de Moraes Holschuh 
Acked-by: Borislav Petkov 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/kernel/cpu/microcode/intel.c | 26 +++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c 
b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..e0981bb2a351 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -356,6 +356,26 @@ get_matching_model_microcode(unsigned long start, void 
*data, size_t size,
return state;
 }
 
+static void cpuid_1(void)
+{
+   /*
+* According to the Intel SDM, Volume 3, 9.11.7:
+*
+*   CPUID returns a value in a model specific register in
+*   addition to its usual register return values. The
+*   semantics of CPUID cause it to deposit an update ID value
+*   in the 64-bit model-specific register at address 08BH
+*   (IA32_BIOS_SIGN_ID). If no update is present in the
+*   processor, the value in the MSR remains unmodified.
+*
+* Use native_cpuid -- this code runs very early and we don't
+* want to mess with paravirt.
+*/
+   unsigned int eax = 1, ebx, ecx = 0, edx;
+
+   native_cpuid(&eax, &ebx, &ecx, &edx);
+}
+
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
unsigned int val[2];
@@ -385,7 +405,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info 
*uci)
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
/* As documented in the SDM: Do a CPUID 1 here */
-   sync_core();
+   cpuid_1();
 
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -627,7 +647,7 @@ static int apply_microcode_early(struct ucode_cpu_info 
*uci, bool early)
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
/* As documented in the SDM: Do a CPUID 1 here */
-   sync_core();
+   cpuid_1();
 
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -927,7 +947,7 @@ static int apply_microcode_intel(int cpu)
wrmsrl(MSR_IA32_UCODE_REV, 0);
 
/* As documented in the SDM: Do a CPUID 1 here */
-   sync_core();
+   cpuid_1();
 
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 1/4] x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels

2016-12-09 Thread Andy Lutomirski
We support various non-Intel CPUs that don't have the CPUID
instruction, so the M486 test was wrong.  For now, fix it with a big
hammer: handle missing CPUID on all 32-bit CPUs.

Reported-by: One Thousand Gnomes 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/processor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 984a7bf17f6a..64fbc937d586 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -595,7 +595,7 @@ static inline void sync_core(void)
 {
int tmp;
 
-#ifdef CONFIG_M486
+#ifdef CONFIG_X86_32
/*
 * Do a CPUID if available, otherwise do a jump.  The jump
 * can conveniently enough be the jump around CPUID.
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 4/4] x86/asm: Rewrite sync_core() to use IRET-to-self

2016-12-09 Thread Andy Lutomirski
Aside from being excessively slow, CPUID is problematic: Linux runs
on a handful of CPUs that don't have CPUID.  Use IRET-to-self
instead.  IRET-to-self works everywhere, so it makes testing easy.

For reference, On my laptop, IRET-to-self is ~110ns,
CPUID(eax=1, ecx=0) is ~83ns on native and very very slow under KVM,
and MOV-to-CR2 is ~42ns.

While we're at it: sync_core() serves a very specific purpose.
Document it.

Cc: "H. Peter Anvin" 
Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/processor.h | 80 +---
 1 file changed, 58 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 64fbc937d586..ceb1f4d3f3fa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -590,33 +590,69 @@ static __always_inline void cpu_relax(void)
 
 #define cpu_relax_lowlatency() cpu_relax()
 
-/* Stop speculative execution and prefetching of modified code. */
+/*
+ * This function forces the icache and prefetched instruction stream to
+ * catch up with reality in two very specific cases:
+ *
+ *  a) Text was modified using one virtual address and is about to be executed
+ * from the same physical page at a different virtual address.
+ *
+ *  b) Text was modified on a different CPU, may subsequently be
+ * executed on this CPU, and you want to make sure the new version
+ * gets executed.  This generally means you're calling this in a IPI.
+ *
+ * If you're calling this for a different reason, you're probably doing
+ * it wrong.
+ */
 static inline void sync_core(void)
 {
-   int tmp;
-
-#ifdef CONFIG_X86_32
/*
-* Do a CPUID if available, otherwise do a jump.  The jump
-* can conveniently enough be the jump around CPUID.
+* There are quite a few ways to do this.  IRET-to-self is nice
+* because it works on every CPU, at any CPL (so it's compatible
+* with paravirtualization), and it never exits to a hypervisor.
+* The only down sides are that it's a bit slow (it seems to be
+* a bit more than 2x slower than the fastest options) and that
+* it unmasks NMIs.  The "push %cs" is needed because, in
+* paravirtual environments, __KERNEL_CS may not be a valid CS
+* value when we do IRET directly.
+*
+* In case NMI unmasking or performance ever becomes a problem,
+* the next best option appears to be MOV-to-CR2 and an
+* unconditional jump.  That sequence also works on all CPUs,
+* but it will fault at CPL3 (i.e. Xen PV and lguest).
+*
+* CPUID is the conventional way, but it's nasty: it doesn't
+* exist on some 486-like CPUs, and it usually exits to a
+* hypervisor.
+*
+* Like all of Linux's memory ordering operations, this is a
+* compiler barrier as well.
 */
-   asm volatile("cmpl %2,%1\n\t"
-"jl 1f\n\t"
-"cpuid\n"
-"1:"
-: "=a" (tmp)
-: "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1)
-: "ebx", "ecx", "edx", "memory");
+   register void *__sp asm(_ASM_SP);
+
+#ifdef CONFIG_X86_32
+   asm volatile (
+   "pushfl\n\t"
+   "pushl %%cs\n\t"
+   "pushl $1f\n\t"
+   "iret\n\t"
+   "1:"
+   : "+r" (__sp) : : "memory");
 #else
-   /*
-* CPUID is a barrier to speculative execution.
-* Prefetched instructions are automatically
-* invalidated when modified.
-*/
-   asm volatile("cpuid"
-: "=a" (tmp)
-: "0" (1)
-: "ebx", "ecx", "edx", "memory");
+   unsigned int tmp;
+
+   asm volatile (
+   "mov %%ss, %0\n\t"
+   "pushq %q0\n\t"
+   "pushq %%rsp\n\t"
+   "addq $8, (%%rsp)\n\t"
+   "pushfq\n\t"
+   "mov %%cs, %0\n\t"
+   "pushq %q0\n\t"
+   "pushq $1f\n\t"
+   "iretq\n\t"
+   "1:"
+   : "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
 #endif
 }
 
-- 
2.9.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v4 0/4] CPUID-less CPU/sync_core fixes and improvements

2016-12-15 Thread Andy Lutomirski
On Fri, Dec 9, 2016 at 10:24 AM, Andy Lutomirski  wrote:
> *** PATCHES 1 and 2 MAY BE 4.9 MATERIAL ***
>
> Alan Cox pointed out that the 486 isn't the only supported CPU that
> doesn't have CPUID.  Let's clean up the mess and make everything
> faster while we're at it.
>

Ping?  Any chance of getting these in for 4.10?

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 2/3] x86: Remap GDT tables in the Fixmap section

2017-03-09 Thread Andy Lutomirski
On Mon, Mar 6, 2017 at 2:03 PM, Thomas Garnier  wrote:
> Each processor holds a GDT in its per-cpu structure. The sgdt
> instruction gives the base address of the current GDT. This address can
> be used to bypass KASLR memory randomization. With another bug, an
> attacker could target other per-cpu structures or deduce the base of
> the main memory section (PAGE_OFFSET).
>
> This patch relocates the GDT table for each processor inside the
> Fixmap section. The space is reserved based on number of supported
> processors.
>
> For consistency, the remapping is done by default on 32 and 64-bit.
>
> Each processor switches to its remapped GDT at the end of
> initialization. For hibernation, the main processor returns with the
> original GDT and switches back to the remapping at completion.
>
> This patch was tested on both architectures. Hibernation and KVM were
> both tested specially for their usage of the GDT.

Looks good with minor nitpicks.  Also, have you tested on Xen PV?

(If you aren't set up for it, virtme can do this test quite easily.  I
could run it for you if you like, too.)

> +static inline unsigned long get_current_gdt_rw_vaddr(void)
> +{
> +   return (unsigned long)get_current_gdt_rw();
> +}

This has no callers, so let's remove it.

> +static inline unsigned long get_cpu_gdt_ro_vaddr(int cpu)
> +{
> +   return (unsigned long)get_cpu_gdt_ro(cpu);
> +}

Ditto.

> +static inline unsigned long get_current_gdt_ro_vaddr(void)
> +{
> +   return (unsigned long)get_current_gdt_ro();
> +}

Ditto.

> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
>
> *shadow = t->tls_array[i];
>
> -   gdt = get_cpu_gdt_table(cpu);
> +   gdt = get_cpu_gdt_rw(cpu);
> maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
> mc = __xen_mc_entry(0);

Boris, is this right?  I don't see why it wouldn't be, but Xen is special.

> @@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void)
>  * byte, not the size, hence the "-1").
>  */
> state->host_gdt_desc.size = GDT_SIZE-1;
> -   state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
> +   state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);

I suspect this should be get_cpu_gdt_ro(), but I don't know too much
about lguest.  Hmm, maybe the right thing to do is to give lguest a
nice farewell and retire it.  The last time I tried to test it, I gave
up.


--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 3/3] x86: Make the GDT remapping read-only on 64-bit

2017-03-09 Thread Andy Lutomirski
On Mon, Mar 6, 2017 at 2:03 PM, Thomas Garnier  wrote:
> This patch makes the GDT remapped pages read-only to prevent corruption.
> This change is done only on 64-bit.
>
> The native_load_tr_desc function was adapted to correctly handle a
> read-only GDT. The LTR instruction always writes to the GDT TSS entry.
> This generates a page fault if the GDT is read-only. This change checks
> if the current GDT is a remap and swap GDTs as needed. This function was
> tested by booting multiple machines and checking hibernation works
> properly.
>
> KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the
> per-cpu variable was removed for functions to fetch the original GDT.
> Instead of reloading the previous GDT, VMX will reload the fixmap GDT as
> expected. For testing, VMs were started and restored on multiple
> configurations.

I like this patch.

> +
> +/*
> + * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
> + * a read-only remapping. To prevent a page fault, the GDT is switched to the
> + * original writeable version when needed.
> + */
> +#ifdef CONFIG_X86_64
>  static inline void native_load_tr_desc(void)
>  {
> +   struct desc_ptr gdt;
> +   int cpu = raw_smp_processor_id();
> +   bool restore = 0;
> +   struct desc_struct *fixmap_gdt;
> +
> +   native_store_gdt(&gdt);

This part will slow this function down considerably, but with the
recent KVM improvements, I think that there are no callers left that
care about performance, so this should be fine.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 2/3] x86: Remap GDT tables in the Fixmap section

2017-03-09 Thread Andy Lutomirski
On Thu, Mar 9, 2017 at 1:43 PM, Andrew Cooper  wrote:
> On 09/03/2017 21:32, Andy Lutomirski wrote:
>> On Mon, Mar 6, 2017 at 2:03 PM, Thomas Garnier  wrote:
>>
>>> --- a/arch/x86/xen/enlighten.c
>>> +++ b/arch/x86/xen/enlighten.c
>>> @@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
>>>
>>> *shadow = t->tls_array[i];
>>>
>>> -   gdt = get_cpu_gdt_table(cpu);
>>> +   gdt = get_cpu_gdt_rw(cpu);
>>> maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
>>> mc = __xen_mc_entry(0);
>> Boris, is this right?  I don't see why it wouldn't be, but Xen is special.
>
> Under Xen PV, the GDT is already read-only at this point.  (It is not
> safe to let the guest have writeable access to system tables, so the
> guest must relinquish write access to the frames wishing to be used as
> LDTs or GDTs.)
>
> The hypercall acts on the frame, not a virtual address, so either alias
> should be fine here.
>
> Under this new scheme, there will be two read-only aliases.  I guess
> this is easier to maintain the split consistently across Linux, than to
> special case Xen PV because it doesn't need the second alias.
>

I think we would gain nothing at all by special-casing Xen PV -- Linux
allocates the fixmap vaddrs at compile time, so we'd still allocate
them even if we rejigger all the helpers to avoid using them.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-13 Thread Andy Lutomirski
On Fri, Nov 13, 2015 at 3:18 PM, Boris Ostrovsky
 wrote:
> After 32-bit syscall rewrite, and specifically after commit 5f310f739b4c
> ("x86/entry/32: Re-implement SYSENTER using the new C path"), the stack
> frame that is passed to xen_sysexit is no longer a "standard" one (i.e.
> it's not pt_regs).
>
> We need to adjust it so that subsequent xen_iret can use it.

I'm wondering if this should be more straightforward:

movq%rsp, %rdi
calldo_fast_syscall_32
testl   %eax, %eax
jz  .Lsyscall_32_done

/* Opportunistic SYSRET */
sysret32_from_system_call:
XEN_DO_SYSRET32

where XEN_DO_SYSRET32 is a simple pv op that, on Xen, jumps to a
variant of Xen's iret path that knows that the fast path is okay.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-15 Thread Andy Lutomirski
On Nov 13, 2015 5:23 PM, "Boris Ostrovsky"  wrote:
>
>
>
> On 11/13/2015 06:26 PM, Andy Lutomirski wrote:
>>
>> On Fri, Nov 13, 2015 at 3:18 PM, Boris Ostrovsky
>>  wrote:
>>>
>>> After 32-bit syscall rewrite, and specifically after commit 5f310f739b4c
>>> ("x86/entry/32: Re-implement SYSENTER using the new C path"), the stack
>>> frame that is passed to xen_sysexit is no longer a "standard" one (i.e.
>>> it's not pt_regs).
>>>
>>> We need to adjust it so that subsequent xen_iret can use it.
>>
>> I'm wondering if this should be more straightforward:
>>
>>  movq%rsp, %rdi
>>  calldo_fast_syscall_32
>>  testl   %eax, %eax
>>  jz  .Lsyscall_32_done
>>
>>  /* Opportunistic SYSRET */
>> sysret32_from_system_call:
>>  XEN_DO_SYSRET32
>>
>> where XEN_DO_SYSRET32 is a simple pv op that, on Xen, jumps to a
>> variant of Xen's iret path that knows that the fast path is okay.
>
>
>
> This patch is for 32-bit kernel. I actually haven't looked at compat code 
> (probably because our tests don't try that), I need to do that too.

In 4.4, it's almost identical (which was part of the point of this
whole series).  We use sysret32 instead of sysexit, but the underlying
structure is the same: munge the stack frame and register state
appropriately to use the fast return instruction in question and then
execute it.  In both cases, the only real difference from the IRET
path is that we're willing to lose the values of some subset of cx,
dx, and (on 64-bit kernels) r11.

>
> As for XEN_DO_SYSRET32 --- we'd presumably need to have a nop for baremetal 
> otherwise current paravirt op will use native_usergs_sysret32 (for compat 
> code). Which means a new pv_op, I think.

Agreed, unless...

Does Xen have a cpufeature?  Using ALTERNATIVE instead of a pvop could
be easier to follow and be less code at the same time.  Frankly,
following the control flow from asm through the pre-paravirt-patching
and post-paravirt-patching variants and into the final targets is
getting a little bit old, and ALTERNATIVE is crystal clear in
comparison (and has all the interesting info inline with the rest of
the asm).  Of course, it doesn't work early in boot, but that's fine
for anything involving user/kernel switches.


--Andy

>
> -boris

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-16 Thread Andy Lutomirski
On Mon, Nov 16, 2015 at 8:25 AM, Boris Ostrovsky
 wrote:
> On 11/15/2015 01:02 PM, Andy Lutomirski wrote:
>>
>> On Nov 13, 2015 5:23 PM, "Boris Ostrovsky" 
>> wrote:
>>>
>>>
>>>
>>> On 11/13/2015 06:26 PM, Andy Lutomirski wrote:
>>>>
>>>> On Fri, Nov 13, 2015 at 3:18 PM, Boris Ostrovsky
>>>>  wrote:
>>>>>
>>>>> After 32-bit syscall rewrite, and specifically after commit
>>>>> 5f310f739b4c
>>>>> ("x86/entry/32: Re-implement SYSENTER using the new C path"), the stack
>>>>> frame that is passed to xen_sysexit is no longer a "standard" one (i.e.
>>>>> it's not pt_regs).
>>>>>
>>>>> We need to adjust it so that subsequent xen_iret can use it.
>>>>
>>>> I'm wondering if this should be more straightforward:
>>>>
>>>>   movq%rsp, %rdi
>>>>   calldo_fast_syscall_32
>>>>   testl   %eax, %eax
>>>>   jz  .Lsyscall_32_done
>>>>
>>>>   /* Opportunistic SYSRET */
>>>> sysret32_from_system_call:
>>>>   XEN_DO_SYSRET32
>>>>
>>>> where XEN_DO_SYSRET32 is a simple pv op that, on Xen, jumps to a
>>>> variant of Xen's iret path that knows that the fast path is okay.
>>>
>>>
>>>
>>> This patch is for 32-bit kernel. I actually haven't looked at compat code
>>> (probably because our tests don't try that), I need to do that too.
>>
>> In 4.4, it's almost identical (which was part of the point of this
>> whole series).  We use sysret32 instead of sysexit, but the underlying
>> structure is the same: munge the stack frame and register state
>> appropriately to use the fast return instruction in question and then
>> execute it.  In both cases, the only real difference from the IRET
>> path is that we're willing to lose the values of some subset of cx,
>> dx, and (on 64-bit kernels) r11.
>
>
>
> So it turned out that for compat mode we don't need to do anything since
> xen_sysret32 doesn't assume any stack format (or, rather, it assumes that it
> can't be used) and builds the IRET frame itself.
>

It's still a waste of effort, though.  Also, I'd eventually like the
number of places in Xen code in which rsp/esp is invalid to be exactly
zero, and this approach makes this harder or even impossible.

>
>>
>>> As for XEN_DO_SYSRET32 --- we'd presumably need to have a nop for
>>> baremetal otherwise current paravirt op will use native_usergs_sysret32 (for
>>> compat code). Which means a new pv_op, I think.
>>
>> Agreed, unless...
>>
>> Does Xen have a cpufeature?  Using ALTERNATIVE instead of a pvop could
>> be easier to follow and be less code at the same time.  Frankly,
>> following the control flow from asm through the pre-paravirt-patching
>> and post-paravirt-patching variants and into the final targets is
>> getting a little bit old, and ALTERNATIVE is crystal clear in
>> comparison (and has all the interesting info inline with the rest of
>> the asm).  Of course, it doesn't work early in boot, but that's fine
>> for anything involving user/kernel switches.
>
>
>
> We don't currently have a Xen-specific CPU feature. We could, in principle,
> add it but we can't replace all of current paravirt patching with a single
> feature since PVH guests use a subset of existing pv ops (and in the future
> it may become even more fine-grained).
>
> And I don't think we should go ALTERNATIVE route for one set of features and
> keep pv ops for the rest --- it should be either one or the other.

Does PVH hook into the entry asm code at all?  I thought it was just
boot code and drivers.

In any case, someone needs to do some serious review and cleanup on
the whole paravirt op mess.  We have a bunch of paravirt ops that
serve little purpose.

The paravirt infrastructure is a bit weird, too: it seems to
effectively have four states for each patch site.  There's:

1. The initial state, which is unoptimized and works on native.
Presumably any of these that happen early also need to work, if
slowly, on Xen.

2. The Xen state without text patching.  I'm not actually sure why
this exists at all.  Are there pvops that need to switch too early for
us to patch the text?

3. The native patched state.  This is supposedly optimal, but it
results in a few more NOPs than are really needed.

4. The Xen patched state.

Alternatives have only two states, and the code

Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-16 Thread Andy Lutomirski
On Mon, Nov 16, 2015 at 11:59 AM, Borislav Petkov  wrote:
> On Mon, Nov 16, 2015 at 11:03:22AM -0800, Andy Lutomirski wrote:
>
>> ...
>> The reader surely doesn't remember that this isn't guaranteed to be a
>> swapgs instruction on native.  Using:
>>
>> ALTERNATIVE "swapgs" "" X86_FEATURE_XENPV
>>
>> would be safer (it would get rid of the SWAPGS_UNSAFE_STACK mess) and
>> much clearer.  We could hide *that* behind a macro and no one would be
>> confused.  (Well, they'd be confused by the fact that Xen PV handles
>> gsbase very differently from native, but that has nothing to do with
>> the macro.)
>>
>> I think we could convert piecemeal, and I wonder if this new patch for
>> 32-bit native on 4.4 (this is needed for 4.4, right?) would be a good
>> starting point.  Borislav, what do you think?  Would you be okay with
>> adding a Xen PV pseudofeature?
>
> AFAICT, I'd prefer this becomes rather a jump label which gets enabled
> on xen. Especially if a single pseudofeature might not be enough,
> apprently...

Except it's not a jump.  (Also, the alternatives infrastructure is IMO
much nicer than the jump label infrastructure.)

Taking SWAPGS as an example, the semantics we need are:

 - On native, do swapgs.  This *can't* be a call due to RSP issues.
 - On Xen PV, swapgs will work, but it's emulated.  We'd rather just nop it out.

In principle, we could static jump over it on Xen, but that also
involves forcing the jump label to be built on old GCC versions, which
PeterZ objected to the last time I asked.

If it would make you feel better, it could be X86_BUG_XENPV :-p

Are there really multiple feature bits for this stuff?  I'd like to
imagine that the entry code is all either Xen PV or native/PVH/PVHVM
-- i.e. I assumed that PVH works like native for all entries.

--Andy tip #101: Trim your mails when you reply.



-- 
Andy Lutomirski
AMA Capital Management, LLC

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-16 Thread Andy Lutomirski
On Mon, Nov 16, 2015 at 12:48 PM, Boris Ostrovsky
 wrote:
> On 11/16/2015 03:22 PM, Borislav Petkov wrote:
>>
>> On Mon, Nov 16, 2015 at 12:11:11PM -0800, Andy Lutomirski wrote:
>>
>>> Are there really multiple feature bits for this stuff?  I'd like to
>>> imagine that the entry code is all either Xen PV or native/PVH/PVHVM
>>> -- i.e. I assumed that PVH works like native for all entries.
>
>
>
> Almost. For PVH we will have a small stub to set up bootparams and such but
> then we jump to startup_{32|64} code.
>
>
>> I just reacted to Boris' statement:
>>
>> "We don't currently have a Xen-specific CPU feature. We could, in
>> principle, add it but we can't replace all of current paravirt patching
>> with a single feature since PVH guests use a subset of existing pv ops
>> (and in the future it may become even more fine-grained)."
>
>
> Actually, nevermind this --- I was thinking about APIC ops and they are not
> pv ops.
>
> Note though that there are other users of pv ops --- lguest and looks like
> KVM (for one op) use them too.
>

Honestly, I think we should just delete lguest some time soon.  And
KVM uses this stuff so lightly that we shouldn't need all of the pvop
stuff.  (In fact, I'm slowly working on removing KVM_GUEST's
dependency on PARAVIRT.)

--Andy


-- 
Andy Lutomirski
AMA Capital Management, LLC

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-16 Thread Andy Lutomirski
On Mon, Nov 16, 2015 at 1:03 PM, Konrad Rzeszutek Wilk
 wrote:
> On Mon, Nov 16, 2015 at 12:50:19PM -0800, Andy Lutomirski wrote:
>> On Mon, Nov 16, 2015 at 12:48 PM, Boris Ostrovsky
>>  wrote:
>> > On 11/16/2015 03:22 PM, Borislav Petkov wrote:
>> >>
>> >> On Mon, Nov 16, 2015 at 12:11:11PM -0800, Andy Lutomirski wrote:
>> >>
>> >>> Are there really multiple feature bits for this stuff?  I'd like to
>> >>> imagine that the entry code is all either Xen PV or native/PVH/PVHVM
>> >>> -- i.e. I assumed that PVH works like native for all entries.
>> >
>> >
>> >
>> > Almost. For PVH we will have a small stub to set up bootparams and such but
>> > then we jump to startup_{32|64} code.
>> >
>> >
>> >> I just reacted to Boris' statement:
>> >>
>> >> "We don't currently have a Xen-specific CPU feature. We could, in
>> >> principle, add it but we can't replace all of current paravirt patching
>> >> with a single feature since PVH guests use a subset of existing pv ops
>> >> (and in the future it may become even more fine-grained)."
>> >
>> >
>> > Actually, nevermind this --- I was thinking about APIC ops and they are not
>> > pv ops.
>> >
>> > Note though that there are other users of pv ops --- lguest and looks like
>> > KVM (for one op) use them too.
>> >
>>
>> Honestly, I think we should just delete lguest some time soon.  And
>> KVM uses this stuff so lightly that we shouldn't need all of the pvop
>> stuff.  (In fact, I'm slowly working on removing KVM_GUEST's
>> dependency on PARAVIRT.)
>
> Even for the pvclock?
>
> (sorry for stealing this thread on this subject).

I don't think that pvclock uses any of the paravirt infrastructure.
It's just another clock source AFAIK.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-17 Thread Andy Lutomirski
On Nov 17, 2015 6:40 AM, "Boris Ostrovsky"  wrote:
>
> On 11/16/2015 04:55 PM, H. Peter Anvin wrote:
>>
>> On 11/16/15 12:22, Borislav Petkov wrote:
>>>
>>> Huh, so what's wrong with a jump:
>>>
>>> jmp 1f
>>> swapgs
>>> 1:
>>>
>> What is the point of that jump?
>>
 If it would make you feel better, it could be X86_BUG_XENPV :-p
>>>
>>> That doesn't matter - I just don't want to open the flood gates on
>>> pseudo feature bits.
>>>
>>> hpa, what do you think?
>>
>> Pseudo feature bits are fine, we already have plenty of them.  They make
>> sense as they let us reuse a lot of infrastructure.
>
>
>
> So how about something like this? And then I think we can remove 
> usergs_sysret32 and irq_enable_sysexit pv ops completely as noone will use 
> them (lguest doesn't set them)
>

Looks good to me.  Does Xen have any sysexit/sysret32 equivalent to
return to 32-bit user mode?  If so, it could be worth trying to wire
it up by patching the jz instead of the test instruction.

Also, I'd prefer X86_FEATURE_XENPV.  IMO "PV" means too many things to
too many people.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-17 Thread Andy Lutomirski
On Tue, Nov 17, 2015 at 11:12 AM, Andrew Cooper
 wrote:
> On 17/11/15 18:49, Andy Lutomirski wrote:
>> On Nov 17, 2015 6:40 AM, "Boris Ostrovsky"  
>> wrote:
>>> On 11/16/2015 04:55 PM, H. Peter Anvin wrote:
>>>> On 11/16/15 12:22, Borislav Petkov wrote:
>>>>> Huh, so what's wrong with a jump:
>>>>>
>>>>> jmp 1f
>>>>> swapgs
>>>>> 1:
>>>>>
>>>> What is the point of that jump?
>>>>
>>>>>> If it would make you feel better, it could be X86_BUG_XENPV :-p
>>>>> That doesn't matter - I just don't want to open the flood gates on
>>>>> pseudo feature bits.
>>>>>
>>>>> hpa, what do you think?
>>>> Pseudo feature bits are fine, we already have plenty of them.  They make
>>>> sense as they let us reuse a lot of infrastructure.
>>>
>>>
>>> So how about something like this? And then I think we can remove 
>>> usergs_sysret32 and irq_enable_sysexit pv ops completely as noone will use 
>>> them (lguest doesn't set them)
>>>
>> Looks good to me.  Does Xen have any sysexit/sysret32 equivalent to
>> return to 32-bit user mode?  If so, it could be worth trying to wire
>> it up by patching the jz instead of the test instruction.
>
> From the guests point of view, there is only hypercall_iret.

Doesn't hypercall_iret have flags that ask for different behavior,
though?  (VG_syscall or whatever for the 64-bit case?)

>
>>
>> Also, I'd prefer X86_FEATURE_XENPV.  IMO "PV" means too many things to
>> too many people.
>
> I agree - PV on its own is too generic.
>
> An alternative might be X86_FEATURE_XEN_PV_GUEST which is very clear an
> unambiguous, although rather longer.

Works for me, too, although seeing "xen_pv_host" in the Linux cpu
features would be very strange indeed :)

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] xen/x86: Adjust stack pointer in xen_sysexit

2015-11-17 Thread Andy Lutomirski
On Tue, Nov 17, 2015 at 11:29 AM, Andrew Cooper
 wrote:
> On 17/11/15 19:16, Andy Lutomirski wrote:
>> On Tue, Nov 17, 2015 at 11:12 AM, Andrew Cooper
>>  wrote:
>>> On 17/11/15 18:49, Andy Lutomirski wrote:
>>>> On Nov 17, 2015 6:40 AM, "Boris Ostrovsky"  
>>>> wrote:
>>>>> On 11/16/2015 04:55 PM, H. Peter Anvin wrote:
>>>>>> On 11/16/15 12:22, Borislav Petkov wrote:
>>>>>>> Huh, so what's wrong with a jump:
>>>>>>>
>>>>>>> jmp 1f
>>>>>>> swapgs
>>>>>>> 1:
>>>>>>>
>>>>>> What is the point of that jump?
>>>>>>
>>>>>>>> If it would make you feel better, it could be X86_BUG_XENPV :-p
>>>>>>> That doesn't matter - I just don't want to open the flood gates on
>>>>>>> pseudo feature bits.
>>>>>>>
>>>>>>> hpa, what do you think?
>>>>>> Pseudo feature bits are fine, we already have plenty of them.  They make
>>>>>> sense as they let us reuse a lot of infrastructure.
>>>>>
>>>>> So how about something like this? And then I think we can remove 
>>>>> usergs_sysret32 and irq_enable_sysexit pv ops completely as noone will 
>>>>> use them (lguest doesn't set them)
>>>>>
>>>> Looks good to me.  Does Xen have any sysexit/sysret32 equivalent to
>>>> return to 32-bit user mode?  If so, it could be worth trying to wire
>>>> it up by patching the jz instead of the test instruction.
>>> From the guests point of view, there is only hypercall_iret.
>> Doesn't hypercall_iret have flags that ask for different behavior,
>> though?  (VG_syscall or whatever for the 64-bit case?)
>
> The one and only flag is VGCF_in_syscall
>
> Xen has its own logic for choosing between sysretq/sysretl if
> VGCF_in_syscall, but will end up on an iret path in all other
> circumstances.

In that case, a nicer version of this patch could preserve the new
sysret-or-iret decision (on 64-bit kernels in the compat path) and use
it to set VGCF_in_syscall.  This might work considerably better now
than it ever would have, since Linux now tries to use sysret32 on
*all* 64-bit CPUs, and the way it's structured for compat is just a
flag (the testl %eax,%eax thing) that indicates that sysret32 is okay.

Anyway, that can be a followup patch.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH 1/3] x86/xen: Avoid fast syscall path for Xen PV guests

2015-11-18 Thread Andy Lutomirski
On Wed, Nov 18, 2015 at 12:06 PM, Boris Ostrovsky
 wrote:
> After 32-bit syscall rewrite, and specifically after commit 5f310f739b4c
> ("x86/entry/32: Re-implement SYSENTER using the new C path"), the stack
> frame that is passed to xen_sysexit is no longer a "standard" one (i.e.
> it's not pt_regs).
>
> Since we end up calling xen_iret from xen_sysexit we don't need to fix
> up the stack and instead follow entry_SYSENTER_32's IRET path directly
> to xen_iret.
>
> We can do the same thing for compat mode even though stack does not need
> to be fixed. This will allow us to drop usergs_sysret32 paravirt op (in
> the subsequent patch)

Looks generally quite nice.  Minor comments below:

> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -308,7 +308,8 @@ sysenter_past_esp:
>
> movl%esp, %eax
> calldo_fast_syscall_32
> -   testl   %eax, %eax
> +   /* XEN PV guests always use IRET path */
> +   ALTERNATIVE "testl %eax, %eax", "xor %eax, %eax", X86_FEATURE_XENPV
> jz  .Lsyscall_32_done

Could we make this a little less subtle:

ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", "jmp
.Lsyscall_32_done", X86_FEATURE_XENPV

Borislav, what do you think?

Ditto for the others.

> diff --git a/arch/x86/include/asm/cpufeature.h 
> b/arch/x86/include/asm/cpufeature.h
> index e4f8010..0e4fe5b 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -216,6 +216,7 @@
>  #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
>  #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
>  #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
> +#define X86_FEATURE_XENPV   ( 8*32+16) /* Xen paravirtual guest */
>

This bit is highly magical and I think we need Borislav's ack.

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH 2/3] x86: irq_enable_sysexit pv op is no longer needed

2015-11-18 Thread Andy Lutomirski
On Wed, Nov 18, 2015 at 12:06 PM, Boris Ostrovsky
 wrote:
> Xen PV guests have been the only ones using it and now they don't.

Fantastic!

--Andy

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


  1   2   3   4   >