Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread hpa
On December 30, 2018 11:21:07 PM PST, Nadav Amit  wrote:
>It is sometimes beneficial to have a restartable sequence - very few
>instructions which if they are preempted jump to a predefined point.
>
>To provide such functionality on x86-64, we use an empty REX-prefix
>(opcode 0x40) as an indication for instruction in such a sequence.
>Before
>calling the schedule IRQ routine, if the "magic" prefix is found, we
>call a routine to adjust the instruction pointer.  It is expected that
>this opcode is not in common use.
>
>The following patch will make use of this function. Since there are no
>other users (yet?), the patch does not bother to create a general
>infrastructure and API that others can use for such sequences. Yet, it
>should not be hard to make such extension later.
>
>Signed-off-by: Nadav Amit 
>---
> arch/x86/entry/entry_64.S| 16 ++--
> arch/x86/include/asm/nospec-branch.h | 12 
> arch/x86/kernel/traps.c  |  7 +++
> 3 files changed, 33 insertions(+), 2 deletions(-)
>
>diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>index 1f0efdb7b629..e144ff8b914f 100644
>--- a/arch/x86/entry/entry_64.S
>+++ b/arch/x86/entry/entry_64.S
>@@ -644,12 +644,24 @@ retint_kernel:
>   /* Interrupts are off */
>   /* Check if we need preemption */
>   btl $9, EFLAGS(%rsp)/* were interrupts off? */
>-  jnc 1f
>+  jnc 2f
> 0:cmpl$0, PER_CPU_VAR(__preempt_count)
>+  jnz 2f
>+
>+  /*
>+   * Allow to use restartable code sections in the kernel. Consider an
>+   * instruction with the first byte having REX prefix without any bits
>+   * set as an indication for an instruction in such a section.
>+   */
>+  movqRIP(%rsp), %rax
>+  cmpb$KERNEL_RESTARTABLE_PREFIX, (%rax)
>   jnz 1f
>+  mov %rsp, %rdi
>+  callrestart_kernel_rseq
>+1:
>   callpreempt_schedule_irq
>   jmp 0b
>-1:
>+2:
> #endif
>   /*
>* The iretq could re-enable interrupts:
>diff --git a/arch/x86/include/asm/nospec-branch.h
>b/arch/x86/include/asm/nospec-branch.h
>index dad12b767ba0..be4713ef0940 100644
>--- a/arch/x86/include/asm/nospec-branch.h
>+++ b/arch/x86/include/asm/nospec-branch.h
>@@ -54,6 +54,12 @@
>   jnz 771b;   \
>   add $(BITS_PER_LONG/8) * nr, sp;
> 
>+/*
>+ * An empty REX-prefix is an indication that this instruction is part
>of kernel
>+ * restartable sequence.
>+ */
>+#define KERNEL_RESTARTABLE_PREFIX (0x40)
>+
> #ifdef __ASSEMBLY__
> 
> /*
>@@ -150,6 +156,12 @@
> #endif
> .endm
> 
>+.macro restartable_seq_prefix
>+#ifdef CONFIG_PREEMPT
>+  .byte   KERNEL_RESTARTABLE_PREFIX
>+#endif
>+.endm
>+
> #else /* __ASSEMBLY__ */
> 
> #define ANNOTATE_NOSPEC_ALTERNATIVE   \
>diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
>index 85cccadb9a65..b1e855bad5ac 100644
>--- a/arch/x86/kernel/traps.c
>+++ b/arch/x86/kernel/traps.c
>@@ -59,6 +59,7 @@
> #include 
> #include 
> #include 
>+#include 
> 
> #ifdef CONFIG_X86_64
> #include 
>@@ -186,6 +187,12 @@ int fixup_bug(struct pt_regs *regs, int trapnr)
>   return 0;
> }
> 
>+asmlinkage __visible void restart_kernel_rseq(struct pt_regs *regs)
>+{
>+  if (user_mode(regs) || *(u8 *)regs->ip != KERNEL_RESTARTABLE_PREFIX)
>+  return;
>+}
>+
> static nokprobe_inline int
>do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
> struct pt_regs *regs, long error_code)

A 0x40 prefix is *not* a noop. It changes the interpretation of byte registers 
4 though 7 from ah, ch, dh, bh to spl, bpl, sil and dil.

It may not matter in your application but:

a. You need to clarify that so is the case, and why;
b. Phrase it differently so others don't propagate the same misunderstanding.
-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread Nadav Amit
> On Jan 3, 2019, at 3:40 PM, Andi Kleen  wrote:
> 
>> Thanks for the explanations. I don’t think it would work (e.g., IRQs). I can
>> avoid generalizing and just detect the "magic sequence” of the code, but let
>> me give it some more thought.
> 
> If you ask me I would just use compiler profile feedback or autofdo (if your
> compiler has a working version)
> 
> The compiler can do a much better job at optimizing this than you ever could.
> 
> Manual FDO needs some kernel patching though.

As long as you are willing to profile and build your own kernels. (Profiling
will not tell you if users are about to use IPv4/6).

I also don’t see how FDO can support modules.



Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread Andi Kleen
> Thanks for the explanations. I don’t think it would work (e.g., IRQs). I can
> avoid generalizing and just detect the "magic sequence” of the code, but let
> me give it some more thought.

If you ask me I would just use compiler profile feedback or autofdo (if your
compiler has a working version)

The compiler can do a much better job at optimizing this than you ever could.

Manual FDO needs some kernel patching though.

-Andi



Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread Nadav Amit
> On Jan 3, 2019, at 2:48 PM, Andi Kleen  wrote:
> 
>> Ok… I’ll try to think about another solution. Just note that this is just
>> used as a hint to avoid unnecessary lookups. (IOW, nothing will break if the
>> prefix is used.)
> 
> Are you sure actually? 
> 
> The empty prefix could mean 8bit register accesses.
> 
>>> You're doing the equivalent of patching a private system call
>>> into your own kernel without working with upstream, don't do that.
>> 
>> I don’t understand this comment though. Can you please explain?
> 
> Instruction encoding = system call ABI
> Upstream = CPU vendors
> 
> Early in Linux's history, naive Linux distribution vendors patched in their 
> own
> private system calls without waiting for upstream to define an ABI, which 
> caused
> endless compatibility problems. These days this is very frowned upon.
> 
>>> Better to find some other solution to do the restart.
>>> How about simply using a per cpu variable? That should be cheaper
>>> anyways.
>> 
>> The problem is that the per-cpu variable needs to be updated after the call
>> is executed, when we are already not in the context of the “injected” code.
>> I can increase it before the call, and decrease it after return - but this
>> can create (in theory) long periods in which the code is “unpatchable”,
>> increase the code size and slow performance.
>> 
>> Anyhow, I’ll give more thought. Ideas are welcomed.
> 
> Write the address of the instruction into the per cpu variable.

Thanks for the explanations. I don’t think it would work (e.g., IRQs). I can
avoid generalizing and just detect the "magic sequence” of the code, but let
me give it some more thought.



Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread Andi Kleen
> Ok… I’ll try to think about another solution. Just note that this is just
> used as a hint to avoid unnecessary lookups. (IOW, nothing will break if the
> prefix is used.)

Are you sure actually? 

The empty prefix could mean 8bit register accesses.

> > You're doing the equivalent of patching a private system call
> > into your own kernel without working with upstream, don't do that.
> 
> I don’t understand this comment though. Can you please explain?

Instruction encoding = system call ABI
Upstream = CPU vendors

Early in Linux's history, naive Linux distribution vendors patched in their own
private system calls without waiting for upstream to define an ABI, which caused
endless compatibility problems. These days this is very frowned upon.

> > Better to find some other solution to do the restart.
> > How about simply using a per cpu variable? That should be cheaper
> > anyways.
> 
> The problem is that the per-cpu variable needs to be updated after the call
> is executed, when we are already not in the context of the “injected” code.
> I can increase it before the call, and decrease it after return - but this
> can create (in theory) long periods in which the code is “unpatchable”,
> increase the code size and slow performance.
> 
> Anyhow, I’ll give more thought. Ideas are welcomed.

Write the address of the instruction into the per cpu variable.

-Andi



Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread Nadav Amit
> On Jan 3, 2019, at 2:21 PM, Andi Kleen  wrote:
> 
> Nadav Amit  writes:
> 
> I see another poor man's attempt to reinvent TSX.
> 
>> It is sometimes beneficial to have a restartable sequence - very few
>> instructions which if they are preempted jump to a predefined point.
>> 
>> To provide such functionality on x86-64, we use an empty REX-prefix
>> (opcode 0x40) as an indication for instruction in such a sequence. Before
>> calling the schedule IRQ routine, if the "magic" prefix is found, we
>> call a routine to adjust the instruction pointer.  It is expected that
>> this opcode is not in common use.
> 
> You cannot just assume something like that. x86 is a constantly
> evolving architecture. The prefix might well have meaning at
> some point.
> 
> Before doing something like that you would need to ask the CPU
> vendors to reserve the sequence you're using for software use.

Ok… I’ll try to think about another solution. Just note that this is just
used as a hint to avoid unnecessary lookups. (IOW, nothing will break if the
prefix is used.)

> You're doing the equivalent of patching a private system call
> into your own kernel without working with upstream, don't do that.

I don’t understand this comment though. Can you please explain?

> Better to find some other solution to do the restart.
> How about simply using a per cpu variable? That should be cheaper
> anyways.

The problem is that the per-cpu variable needs to be updated after the call
is executed, when we are already not in the context of the “injected” code.
I can increase it before the call, and decrease it after return - but this
can create (in theory) long periods in which the code is “unpatchable”,
increase the code size and slow performance.

Anyhow, I’ll give more thought. Ideas are welcomed.



Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2019-01-03 Thread Andi Kleen
Nadav Amit  writes:

I see another poor man's attempt to reinvent TSX.

> It is sometimes beneficial to have a restartable sequence - very few
> instructions which if they are preempted jump to a predefined point.
>
> To provide such functionality on x86-64, we use an empty REX-prefix
> (opcode 0x40) as an indication for instruction in such a sequence. Before
> calling the schedule IRQ routine, if the "magic" prefix is found, we
> call a routine to adjust the instruction pointer.  It is expected that
> this opcode is not in common use.

You cannot just assume something like that. x86 is a constantly
evolving architecture. The prefix might well have meaning at
some point.

Before doing something like that you would need to ask the CPU
vendors to reserve the sequence you're using for software use.

You're doing the equivalent of patching a private system call
into your own kernel without working with upstream, don't do that.

Better to find some other solution to do the restart.
How about simply using a per cpu variable? That should be cheaper
anyways.

-Andi


Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2018-12-31 Thread Nadav Amit
> On Dec 31, 2018, at 12:08 PM, Andy Lutomirski  wrote:
> 
> On Sun, Dec 30, 2018 at 11:20 PM Nadav Amit  wrote:
>> It is sometimes beneficial to have a restartable sequence - very few
>> instructions which if they are preempted jump to a predefined point.
>> 
>> To provide such functionality on x86-64, we use an empty REX-prefix
>> (opcode 0x40) as an indication for instruction in such a sequence. Before
>> calling the schedule IRQ routine, if the "magic" prefix is found, we
>> call a routine to adjust the instruction pointer.  It is expected that
>> this opcode is not in common use.
>> 
>> The following patch will make use of this function. Since there are no
>> other users (yet?), the patch does not bother to create a general
>> infrastructure and API that others can use for such sequences. Yet, it
>> should not be hard to make such extension later.
> 
> The following patch does not use it.  Can you update this?

I will. Sorry for not updating the commit-log. The GCC plugin, and various
requests (that I am not sure I fully agree with) really caused me to spit
some blood.

>> +asmlinkage __visible void restart_kernel_rseq(struct pt_regs *regs)
>> +{
>> +   if (user_mode(regs) || *(u8 *)regs->ip != KERNEL_RESTARTABLE_PREFIX)
>> +   return;
> 
> else?
> 
> I suspect something is missing here.  Or I'm very confused.

Indeed, the code should call optpoline_restart_rseq() (or some other name,
once I fix the naming). I will fix it.



Re: [RFC v2 1/6] x86: introduce kernel restartable sequence

2018-12-31 Thread Andy Lutomirski
On Sun, Dec 30, 2018 at 11:20 PM Nadav Amit  wrote:
>
> It is sometimes beneficial to have a restartable sequence - very few
> instructions which if they are preempted jump to a predefined point.
>
> To provide such functionality on x86-64, we use an empty REX-prefix
> (opcode 0x40) as an indication for instruction in such a sequence. Before
> calling the schedule IRQ routine, if the "magic" prefix is found, we
> call a routine to adjust the instruction pointer.  It is expected that
> this opcode is not in common use.
>
> The following patch will make use of this function. Since there are no
> other users (yet?), the patch does not bother to create a general
> infrastructure and API that others can use for such sequences. Yet, it
> should not be hard to make such extension later.

The following patch does not use it.  Can you update this?


> +asmlinkage __visible void restart_kernel_rseq(struct pt_regs *regs)
> +{
> +   if (user_mode(regs) || *(u8 *)regs->ip != KERNEL_RESTARTABLE_PREFIX)
> +   return;

else?

I suspect something is missing here.  Or I'm very confused.

> +}


[RFC v2 1/6] x86: introduce kernel restartable sequence

2018-12-30 Thread Nadav Amit
It is sometimes beneficial to have a restartable sequence - very few
instructions which if they are preempted jump to a predefined point.

To provide such functionality on x86-64, we use an empty REX-prefix
(opcode 0x40) as an indication for instruction in such a sequence. Before
calling the schedule IRQ routine, if the "magic" prefix is found, we
call a routine to adjust the instruction pointer.  It is expected that
this opcode is not in common use.

The following patch will make use of this function. Since there are no
other users (yet?), the patch does not bother to create a general
infrastructure and API that others can use for such sequences. Yet, it
should not be hard to make such extension later.

Signed-off-by: Nadav Amit 
---
 arch/x86/entry/entry_64.S| 16 ++--
 arch/x86/include/asm/nospec-branch.h | 12 
 arch/x86/kernel/traps.c  |  7 +++
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..e144ff8b914f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -644,12 +644,24 @@ retint_kernel:
/* Interrupts are off */
/* Check if we need preemption */
btl $9, EFLAGS(%rsp)/* were interrupts off? */
-   jnc 1f
+   jnc 2f
 0: cmpl$0, PER_CPU_VAR(__preempt_count)
+   jnz 2f
+
+   /*
+* Allow to use restartable code sections in the kernel. Consider an
+* instruction with the first byte having REX prefix without any bits
+* set as an indication for an instruction in such a section.
+*/
+   movqRIP(%rsp), %rax
+   cmpb$KERNEL_RESTARTABLE_PREFIX, (%rax)
jnz 1f
+   mov %rsp, %rdi
+   callrestart_kernel_rseq
+1:
callpreempt_schedule_irq
jmp 0b
-1:
+2:
 #endif
/*
 * The iretq could re-enable interrupts:
diff --git a/arch/x86/include/asm/nospec-branch.h 
b/arch/x86/include/asm/nospec-branch.h
index dad12b767ba0..be4713ef0940 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -54,6 +54,12 @@
jnz 771b;   \
add $(BITS_PER_LONG/8) * nr, sp;
 
+/*
+ * An empty REX-prefix is an indication that this instruction is part of kernel
+ * restartable sequence.
+ */
+#define KERNEL_RESTARTABLE_PREFIX  (0x40)
+
 #ifdef __ASSEMBLY__
 
 /*
@@ -150,6 +156,12 @@
 #endif
 .endm
 
+.macro restartable_seq_prefix
+#ifdef CONFIG_PREEMPT
+   .byte   KERNEL_RESTARTABLE_PREFIX
+#endif
+.endm
+
 #else /* __ASSEMBLY__ */
 
 #define ANNOTATE_NOSPEC_ALTERNATIVE\
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 85cccadb9a65..b1e855bad5ac 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_X86_64
 #include 
@@ -186,6 +187,12 @@ int fixup_bug(struct pt_regs *regs, int trapnr)
return 0;
 }
 
+asmlinkage __visible void restart_kernel_rseq(struct pt_regs *regs)
+{
+   if (user_mode(regs) || *(u8 *)regs->ip != KERNEL_RESTARTABLE_PREFIX)
+   return;
+}
+
 static nokprobe_inline int
 do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
  struct pt_regs *regs, long error_code)
-- 
2.17.1