[RFC PATCH 2/3] restartable sequences: x86 ABI

2015-10-22 Thread Dave Watson
Implements the x86 (i386 & x86-64) ABIs for interrupting and restarting
execution within restartable sequence sections.

Ptrace is modified to single step over the entire critical region.
---
 arch/x86/entry/common.c  |  3 ++
 arch/x86/entry/syscalls/syscall_64.tbl   |  1 +
 arch/x86/include/asm/restartable_sequences.h | 44 ++
 arch/x86/kernel/Makefile |  2 ++
 arch/x86/kernel/ptrace.c |  6 ++--
 arch/x86/kernel/restartable_sequences.c  | 47 
 arch/x86/kernel/signal.c | 12 ++-
 kernel/restartable_sequences.c   | 11 +--
 8 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/include/asm/restartable_sequences.h
 create mode 100644 arch/x86/kernel/restartable_sequences.c

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc92..e817f04 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -24,6 +24,7 @@
 
 #include 
 #include 
+#include <asm/restartable_sequences.h>
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -253,6 +254,8 @@ __visible void prepare_exit_to_usermode(struct pt_regs 
*regs)
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+   if (rseq_active(current))
+   arch_rseq_handle_notify_resume(regs);
}
 
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842f..0fd4243 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,7 @@
 322	64	execveat		stub_execveat
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
+325	common	restartable_sequences	sys_restartable_sequences
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/restartable_sequences.h 
b/arch/x86/include/asm/restartable_sequences.h
new file mode 100644
index 0000000..c0bcab2
--- /dev/null
+++ b/arch/x86/include/asm/restartable_sequences.h
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
+#define _ASM_X86_RESTARTABLE_SEQUENCES_H
+
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_RESTARTABLE_SEQUENCES
+
+static inline unsigned long arch_rseq_in_crit_section(struct task_struct *p,
+   struct pt_regs *regs)
+{
+   unsigned long ip = (unsigned long)regs->ip;
+
+   return rseq_lookup(p, ip);
+}
+
+static inline bool arch_rseq_needs_notify_resume(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT
+   /*
+* Under CONFIG_PREEMPT it's possible for regs to be incoherent in the
+* case that we took an interrupt during syscall entry.  Avoid this by
+* always deferring to our notify-resume handler.
+*/
+   return true;
+#else
+   return arch_rseq_in_crit_section(p, task_pt_regs(p));
+#endif
+}
+
+void arch_rseq_handle_notify_resume(struct pt_regs *regs);
+void arch_rseq_check_critical_section(struct task_struct *p,
+ struct pt_regs *regs);
+
+#else /* !CONFIG_RESTARTABLE_SEQUENCES */
+
+static inline void arch_rseq_handle_notify_resume(struct pt_regs *regs) {}
+static inline void arch_rseq_check_critical_section(struct task_struct *p,
+   struct pt_regs *regs) {}
+
+#endif
+
+#endif /* _ASM_X86_RESTARTABLE_SEQUENCES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ff..ee98fb6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -110,6 +110,8 @@ obj-$(CONFIG_EFI)   += sysfb_efi.o
 obj-$(CONFIG_PERF_EVENTS)  += perf_regs.o
 obj-$(CONFIG_TRACING)  += tracepoint.o
 
+obj-$(CONFIG_RESTARTABLE_SEQUENCES)+= restartable_sequences.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 558f50e..934aeaf 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1439,6 +1439,8 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs 
*regs,
struct siginfo info;
 
fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
-   /* Send us the fake SIGTRAP */
-   force_sig_info(SIGTRAP, &info, tsk);
+   /* Don't single step in to a restartable sequence */
+   if (!rseq_lookup(tsk, (unsigned long)regs->ip))
+   /* Send us the fake SIGTRAP */
+   force_sig_info(SIGTRAP, &info, tsk);
 }
diff --git a/arch/x86/kernel/restartable_sequences.c 
b/arch/x86/kernel/restartable_sequences.c
new file mode 100644
index 0000000..330568a
--- /dev/null
+++ b/arch/x86/kernel/resta

Re: [RFC PATCH 2/3] restartable sequences: x86 ABI

2015-06-26 Thread Paul Turner
On Fri, Jun 26, 2015 at 12:31 PM, Andy Lutomirski  wrote:
> On Fri, Jun 26, 2015 at 11:09 AM, Mathieu Desnoyers
>  wrote:
>> - On Jun 24, 2015, at 6:26 PM, Paul Turner pjt@google.com wrote:
>>
>>> Implements the x86 (i386 & x86-64) ABIs for interrupting and restarting
>>> execution within restartable sequence sections.
>>>
>>> With respect to the x86-specific ABI:
>>>  On 32-bit:   Upon restart, the interrupted rip is placed in %ecx
>>>  On 64-bit (or x32):  Upon restart, the interrupted rip is placed in %r10
>>>
>>> While potentially surprising at first glance, this choice is strongly 
>>> motivated
>>> by the fact that the available scratch registers under the i386 function 
>>> call
>>> ABI overlap with those used as argument registers under x86_64.
>>>
>>> Given that sequences are already personality specific and that we always 
>>> want
>>> the arguments to be available for sequence restart, it's much more natural 
>>> to
>>> ultimately differentiate the ABI in these two cases.
>>>
>>> Signed-off-by: Paul Turner 
>>> ---
>>> arch/x86/include/asm/restartable_sequences.h |   50 +++
>>> arch/x86/kernel/Makefile |2 +
>>> arch/x86/kernel/restartable_sequences.c  |   69 
>>> ++
>>> arch/x86/kernel/signal.c |   12 +
>>> kernel/restartable_sequences.c   |   11 +++-
>>> 5 files changed, 141 insertions(+), 3 deletions(-)
>>> create mode 100644 arch/x86/include/asm/restartable_sequences.h
>>> create mode 100644 arch/x86/kernel/restartable_sequences.c
>>>
>>> diff --git a/arch/x86/include/asm/restartable_sequences.h
>>> b/arch/x86/include/asm/restartable_sequences.h
>>> new file mode 100644
>>> index 000..0ceb024
>>> --- /dev/null
>>> +++ b/arch/x86/include/asm/restartable_sequences.h
>>> @@ -0,0 +1,50 @@
>>> +#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
>>> +#define _ASM_X86_RESTARTABLE_SEQUENCES_H
>>> +
>>> +#include 
>>> +#include 
>>> +#include 
>>> +
>>> +#ifdef CONFIG_RESTARTABLE_SEQUENCES
>>> +
>>> +static inline bool arch_rseq_in_crit_section(struct task_struct *p,
>>> +  struct pt_regs *regs)
>>> +{
>>> + struct task_struct *leader = p->group_leader;
>>> + struct restartable_sequence_state *rseq_state = &leader->rseq_state;
>>> +
>>> + unsigned long ip = (unsigned long)regs->ip;
>>> + if (unlikely(ip < (unsigned long)rseq_state->crit_end &&
>>> +  ip >= (unsigned long)rseq_state->crit_start))
>>> + return true;
>>> +
>>> + return false;
>>> +}
>>> +
>>> +static inline bool arch_rseq_needs_notify_resume(struct task_struct *p)
>>> +{
>>> +#ifdef CONFIG_PREEMPT
>>> + /*
>>> +  * Under CONFIG_PREEMPT it's possible for regs to be incoherent in the
>>> +  * case that we took an interrupt during syscall entry.  Avoid this by
>>> +  * always deferring to our notify-resume handler.
>>> +  */
>>> + return true;
>>
>> I'm a bit puzzled about this. If I look at perf_get_regs_user() in the perf
>> code, task_pt_regs() seems to return the user-space pt_regs for a task with
>> a current->mm set (iow, not a kernel thread), even if an interrupt nests on
>> top of a system call. The only corner-case is NMIs, where an NMI may 
>> interrupt
>> in the middle of setting up the task pt_regs, but scheduling should never 
>> happen
>> there, right ?
>
> Careful, here!  task_pt_regs returns a pointer to the place where regs
> would be if they were fully initialized.  We can certainly take an
> interrupt in the middle of pt_regs setup (entry_SYSCALL_64 enables
> interrupts very early, for example).  To me, the question is whether
> we can ever be preemptable at such a time.
>
> It's a bit worse, though: we can certainly be preemptible when other
> code is accessing pt_regs.  clone, execve, sigreturn, and signal
> delivery come to mind.

Yeah Andy covered it exactly: interrupt in pt_regs setup.

With respect to whether we can be preemptible; I think we were
concerned about rescheduling during syscall entry but I'd have to
re-audit the current state of entry_64.S :)

Mathieu also wrote:
> Moving ENABLE_INTERRUPTS(CLBR_NONE) 3 instructions down, just after
> pushq   %rcx/* pt_regs->ip */
> might solve your issue here. (in entry_SYSCALL_64_after_swapgs)

We considered doing something exactly like this; but I think any
potential changes here should be made in isolation of this series.

>
> Why don't we give up on poking at user state from the scheduler and do
> it on exit to user mode instead?  Starting in 4.3 (hopefully landing
> in -tip in a week or two), we should have a nice function
> prepare_exit_to_usermode that runs with well-defined state,
> non-reentrantly, that can do whatever you want here, *including user
> memory access*.

So this series already does the exact approximation of that:
The only thing we touch in the scheduler is looking at the kernel copy
pt_regs in the ca

Re: [RFC PATCH 2/3] restartable sequences: x86 ABI

2015-06-26 Thread Andy Lutomirski
On Fri, Jun 26, 2015 at 11:09 AM, Mathieu Desnoyers
 wrote:
> - On Jun 24, 2015, at 6:26 PM, Paul Turner pjt@google.com wrote:
>
>> Implements the x86 (i386 & x86-64) ABIs for interrupting and restarting
>> execution within restartable sequence sections.
>>
>> With respect to the x86-specific ABI:
>>  On 32-bit:   Upon restart, the interrupted rip is placed in %ecx
>>  On 64-bit (or x32):  Upon restart, the interrupted rip is placed in %r10
>>
>> While potentially surprising at first glance, this choice is strongly 
>> motivated
>> by the fact that the available scratch registers under the i386 function call
>> ABI overlap with those used as argument registers under x86_64.
>>
>> Given that sequences are already personality specific and that we always want
>> the arguments to be available for sequence restart, it's much more natural to
>> ultimately differentiate the ABI in these two cases.
>>
>> Signed-off-by: Paul Turner 
>> ---
>> arch/x86/include/asm/restartable_sequences.h |   50 +++
>> arch/x86/kernel/Makefile |2 +
>> arch/x86/kernel/restartable_sequences.c  |   69 
>> ++
>> arch/x86/kernel/signal.c |   12 +
>> kernel/restartable_sequences.c   |   11 +++-
>> 5 files changed, 141 insertions(+), 3 deletions(-)
>> create mode 100644 arch/x86/include/asm/restartable_sequences.h
>> create mode 100644 arch/x86/kernel/restartable_sequences.c
>>
>> diff --git a/arch/x86/include/asm/restartable_sequences.h
>> b/arch/x86/include/asm/restartable_sequences.h
>> new file mode 100644
>> index 000..0ceb024
>> --- /dev/null
>> +++ b/arch/x86/include/asm/restartable_sequences.h
>> @@ -0,0 +1,50 @@
>> +#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
>> +#define _ASM_X86_RESTARTABLE_SEQUENCES_H
>> +
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#ifdef CONFIG_RESTARTABLE_SEQUENCES
>> +
>> +static inline bool arch_rseq_in_crit_section(struct task_struct *p,
>> +  struct pt_regs *regs)
>> +{
>> + struct task_struct *leader = p->group_leader;
>> + struct restartable_sequence_state *rseq_state = &leader->rseq_state;
>> +
>> + unsigned long ip = (unsigned long)regs->ip;
>> + if (unlikely(ip < (unsigned long)rseq_state->crit_end &&
>> +  ip >= (unsigned long)rseq_state->crit_start))
>> + return true;
>> +
>> + return false;
>> +}
>> +
>> +static inline bool arch_rseq_needs_notify_resume(struct task_struct *p)
>> +{
>> +#ifdef CONFIG_PREEMPT
>> + /*
>> +  * Under CONFIG_PREEMPT it's possible for regs to be incoherent in the
>> +  * case that we took an interrupt during syscall entry.  Avoid this by
>> +  * always deferring to our notify-resume handler.
>> +  */
>> + return true;
>
> I'm a bit puzzled about this. If I look at perf_get_regs_user() in the perf
> code, task_pt_regs() seems to return the user-space pt_regs for a task with
> a current->mm set (iow, not a kernel thread), even if an interrupt nests on
> top of a system call. The only corner-case is NMIs, where an NMI may interrupt
> in the middle of setting up the task pt_regs, but scheduling should never 
> happen
> there, right ?

Careful, here!  task_pt_regs returns a pointer to the place where regs
would be if they were fully initialized.  We can certainly take an
interrupt in the middle of pt_regs setup (entry_SYSCALL_64 enables
interrupts very early, for example).  To me, the question is whether
we can ever be preemptable at such a time.

It's a bit worse, though: we can certainly be preemptible when other
code is accessing pt_regs.  clone, execve, sigreturn, and signal
delivery come to mind.

Why don't we give up on poking at user state from the scheduler and do
it on exit to user mode instead?  Starting in 4.3 (hopefully landing
in -tip in a week or two), we should have a nice function
prepare_exit_to_usermode that runs with well-defined state,
non-reentrantly, that can do whatever you want here, *including user
memory access*.

The remaining question would be what the ABI should be.

Could we get away with a vDSO function along the lines of "set *A=B
and *X=Y if we're on cpu N and *X=Z"?  Straight-up cmpxchg would be
even simpler.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 2/3] restartable sequences: x86 ABI

2015-06-26 Thread Mathieu Desnoyers
- On Jun 26, 2015, at 2:09 PM, Mathieu Desnoyers 
mathieu.desnoyers@efficios.com wrote:

> - On Jun 24, 2015, at 6:26 PM, Paul Turner pjt@google.com wrote:
> 
>> Implements the x86 (i386 & x86-64) ABIs for interrupting and restarting
>> execution within restartable sequence sections.
>> 
>> With respect to the x86-specific ABI:
>>  On 32-bit:   Upon restart, the interrupted rip is placed in %ecx
>>  On 64-bit (or x32):  Upon restart, the interrupted rip is placed in %r10
>> 
>> While potentially surprising at first glance, this choice is strongly 
>> motivated
>> by the fact that the available scratch registers under the i386 function call
>> ABI overlap with those used as argument registers under x86_64.
>> 
>> Given that sequences are already personality specific and that we always want
>> the arguments to be available for sequence restart, it's much more natural to
>> ultimately differentiate the ABI in these two cases.
>> 
>> Signed-off-by: Paul Turner 
>> ---
>> arch/x86/include/asm/restartable_sequences.h |   50 +++
>> arch/x86/kernel/Makefile |2 +
>> arch/x86/kernel/restartable_sequences.c  |   69 
>> ++
>> arch/x86/kernel/signal.c |   12 +
>> kernel/restartable_sequences.c   |   11 +++-
>> 5 files changed, 141 insertions(+), 3 deletions(-)
>> create mode 100644 arch/x86/include/asm/restartable_sequences.h
>> create mode 100644 arch/x86/kernel/restartable_sequences.c
>> 
>> diff --git a/arch/x86/include/asm/restartable_sequences.h
>> b/arch/x86/include/asm/restartable_sequences.h
>> new file mode 100644
>> index 000..0ceb024
>> --- /dev/null
>> +++ b/arch/x86/include/asm/restartable_sequences.h
>> @@ -0,0 +1,50 @@
>> +#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
>> +#define _ASM_X86_RESTARTABLE_SEQUENCES_H
>> +
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#ifdef CONFIG_RESTARTABLE_SEQUENCES
>> +
>> +static inline bool arch_rseq_in_crit_section(struct task_struct *p,
>> + struct pt_regs *regs)
>> +{
>> +struct task_struct *leader = p->group_leader;
>> +struct restartable_sequence_state *rseq_state = &leader->rseq_state;
>> +
>> +unsigned long ip = (unsigned long)regs->ip;
>> +if (unlikely(ip < (unsigned long)rseq_state->crit_end &&
>> + ip >= (unsigned long)rseq_state->crit_start))
>> +return true;
>> +
>> +return false;
>> +}
>> +
>> +static inline bool arch_rseq_needs_notify_resume(struct task_struct *p)
>> +{
>> +#ifdef CONFIG_PREEMPT
>> +/*
>> + * Under CONFIG_PREEMPT it's possible for regs to be incoherent in the
>> + * case that we took an interrupt during syscall entry.  Avoid this by
>> + * always deferring to our notify-resume handler.
>> + */
>> +return true;
> 
> I'm a bit puzzled about this. If I look at perf_get_regs_user() in the perf
> code, task_pt_regs() seems to return the user-space pt_regs for a task with
> a current->mm set (iow, not a kernel thread), even if an interrupt nests on
> top of a system call. The only corner-case is NMIs, where an NMI may interrupt
> in the middle of setting up the task pt_regs, but scheduling should never 
> happen
> there, right ?
> 
> Since it's impossible for kernel threads to have a rseq critical section,
> we should be able to assume that every time task_pt_regs() returns a
> non-userspace (user_mode(regs) != 0) pt_regs implies that scheduling applies
> to a kernel thread. Therefore, following this line of thoughts,
> arch_rseq_in_crit_section() should work for CONFIG_PREEMPT kernels too.
> 
> So what I am missing here ?

AFAIU, the comment near this check in perf_get_regs_user() is bogus.
It does not only apply to NMIs, but also applies to normal interrupt
handlers that nest over the stack setup on syscall entry (below
entry_SYSCALL_64_after_swapgs in entry_64.S):

struct pt_regs *user_regs = task_pt_regs(current);

/*
 * If we're in an NMI that interrupted task_pt_regs setup, then
 * we can't sample user regs at all.  This check isn't really
 * sufficient, though, as we could be in an NMI inside an interrupt
 * that happened during task_pt_regs setup.
 */
if (regs->sp > (unsigned long)&user_regs->r11 &&
regs->sp <= (unsigned long)(user_regs + 1)) {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
return;
}

That would be how, for tracing, those races can be avoided. It
might not be a huge issue for perf samples to lose one sample once
in a while, but I understand that this statistical approach would
be incorrect in the context of RSEQ.

Moving ENABLE_INTERRUPTS(CLBR_NONE) 3 instructions down, just after
pushq   %rcx/* pt_regs->ip */
might solve your issue here. (in entry_SYSCALL_64_after_swapgs)

Thoughts ?

Thanks,

Mathieu


> 
> T

Re: [RFC PATCH 2/3] restartable sequences: x86 ABI

2015-06-26 Thread Mathieu Desnoyers
- On Jun 24, 2015, at 6:26 PM, Paul Turner pjt@google.com wrote:

> Implements the x86 (i386 & x86-64) ABIs for interrupting and restarting
> execution within restartable sequence sections.
> 
> With respect to the x86-specific ABI:
>  On 32-bit:   Upon restart, the interrupted rip is placed in %ecx
>  On 64-bit (or x32):  Upon restart, the interrupted rip is placed in %r10
> 
> While potentially surprising at first glance, this choice is strongly 
> motivated
> by the fact that the available scratch registers under the i386 function call
> ABI overlap with those used as argument registers under x86_64.
> 
> Given that sequences are already personality specific and that we always want
> the arguments to be available for sequence restart, it's much more natural to
> ultimately differentiate the ABI in these two cases.
> 
> Signed-off-by: Paul Turner 
> ---
> arch/x86/include/asm/restartable_sequences.h |   50 +++
> arch/x86/kernel/Makefile |2 +
> arch/x86/kernel/restartable_sequences.c  |   69 ++
> arch/x86/kernel/signal.c |   12 +
> kernel/restartable_sequences.c   |   11 +++-
> 5 files changed, 141 insertions(+), 3 deletions(-)
> create mode 100644 arch/x86/include/asm/restartable_sequences.h
> create mode 100644 arch/x86/kernel/restartable_sequences.c
> 
> diff --git a/arch/x86/include/asm/restartable_sequences.h
> b/arch/x86/include/asm/restartable_sequences.h
> new file mode 100644
> index 000..0ceb024
> --- /dev/null
> +++ b/arch/x86/include/asm/restartable_sequences.h
> @@ -0,0 +1,50 @@
> +#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
> +#define _ASM_X86_RESTARTABLE_SEQUENCES_H
> +
> +#include 
> +#include 
> +#include 
> +
> +#ifdef CONFIG_RESTARTABLE_SEQUENCES
> +
> +static inline bool arch_rseq_in_crit_section(struct task_struct *p,
> +  struct pt_regs *regs)
> +{
> + struct task_struct *leader = p->group_leader;
> + struct restartable_sequence_state *rseq_state = &leader->rseq_state;
> +
> + unsigned long ip = (unsigned long)regs->ip;
> + if (unlikely(ip < (unsigned long)rseq_state->crit_end &&
> +  ip >= (unsigned long)rseq_state->crit_start))
> + return true;
> +
> + return false;
> +}
> +
> +static inline bool arch_rseq_needs_notify_resume(struct task_struct *p)
> +{
> +#ifdef CONFIG_PREEMPT
> + /*
> +  * Under CONFIG_PREEMPT it's possible for regs to be incoherent in the
> +  * case that we took an interrupt during syscall entry.  Avoid this by
> +  * always deferring to our notify-resume handler.
> +  */
> + return true;

I'm a bit puzzled about this. If I look at perf_get_regs_user() in the perf
code, task_pt_regs() seems to return the user-space pt_regs for a task with
a current->mm set (iow, not a kernel thread), even if an interrupt nests on
top of a system call. The only corner-case is NMIs, where an NMI may interrupt
in the middle of setting up the task pt_regs, but scheduling should never happen
there, right ?

Since it's impossible for kernel threads to have a rseq critical section,
we should be able to assume that every time task_pt_regs() returns a
non-userspace (user_mode(regs) != 0) pt_regs implies that scheduling applies
to a kernel thread. Therefore, following this line of thoughts,
arch_rseq_in_crit_section() should work for CONFIG_PREEMPT kernels too.

So what I am missing here ?

Thanks,

Mathieu

> +#else
> + return arch_rseq_in_crit_section(p, task_pt_regs(p));
> +#endif
> +}
> +
> +void arch_rseq_handle_notify_resume(struct pt_regs *regs);
> +void arch_rseq_check_critical_section(struct task_struct *p,
> +   struct pt_regs *regs);
> +
> +#else /* !CONFIG_RESTARTABLE_SEQUENCES */
> +
> +static inline void arch_rseq_handle_notify_resume(struct pt_regs *regs) {}
> +static inline void arch_rseq_check_critical_section(struct task_struct *p,
> + struct pt_regs *regs) {}
> +
> +#endif
> +
> +#endif /* _ASM_X86_RESTARTABLE_SEQUENCES_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index febaf18..bd7827d 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -113,6 +113,8 @@ obj-$(CONFIG_TRACING) += tracepoint.o
> obj-$(CONFIG_IOSF_MBI)+= iosf_mbi.o
> obj-$(CONFIG_PMC_ATOM)+= pmc_atom.o
> 
> +obj-$(CONFIG_RESTARTABLE_SEQUENCES)  += restartable_sequences.o
> +
> ###
> # 64 bit specific files
> ifeq ($(CONFIG_X86_64),y)
> diff --git a/arch/x86/kernel/restartable_sequences.c
> b/arch/x86/kernel/restartable_sequences.c
> new file mode 100644
> index 000..3b38013
> --- /dev/null
> +++ b/arch/x86/kernel/restartable_sequences.c
> @@ -0,0 +1,69 @@
> +/*
> + * Restartable Sequences: x86 ABI.
> + *
> + * This program is free software; you can redistribute it and/or m

[RFC PATCH 2/3] restartable sequences: x86 ABI

2015-06-24 Thread Paul Turner
Implements the x86 (i386 & x86-64) ABIs for interrupting and restarting
execution within restartable sequence sections.

With respect to the x86-specific ABI:
  On 32-bit:   Upon restart, the interrupted rip is placed in %ecx
  On 64-bit (or x32):  Upon restart, the interrupted rip is placed in %r10

While potentially surprising at first glance, this choice is strongly motivated
by the fact that the available scratch registers under the i386 function call
ABI overlap with those used as argument registers under x86_64.

Given that sequences are already personality specific and that we always want
the arguments to be available for sequence restart, it's much more natural to
ultimately differentiate the ABI in these two cases.

Signed-off-by: Paul Turner 
---
 arch/x86/include/asm/restartable_sequences.h |   50 +++
 arch/x86/kernel/Makefile |2 +
 arch/x86/kernel/restartable_sequences.c  |   69 ++
 arch/x86/kernel/signal.c |   12 +
 kernel/restartable_sequences.c   |   11 +++-
 5 files changed, 141 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/include/asm/restartable_sequences.h
 create mode 100644 arch/x86/kernel/restartable_sequences.c

diff --git a/arch/x86/include/asm/restartable_sequences.h 
b/arch/x86/include/asm/restartable_sequences.h
new file mode 100644
index 000..0ceb024
--- /dev/null
+++ b/arch/x86/include/asm/restartable_sequences.h
@@ -0,0 +1,50 @@
+#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
+#define _ASM_X86_RESTARTABLE_SEQUENCES_H
+
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_RESTARTABLE_SEQUENCES
+
+static inline bool arch_rseq_in_crit_section(struct task_struct *p,
+struct pt_regs *regs)
+{
+   struct task_struct *leader = p->group_leader;
+   struct restartable_sequence_state *rseq_state = &leader->rseq_state;
+
+   unsigned long ip = (unsigned long)regs->ip;
+   if (unlikely(ip < (unsigned long)rseq_state->crit_end &&
+ip >= (unsigned long)rseq_state->crit_start))
+   return true;
+
+   return false;
+}
+
+static inline bool arch_rseq_needs_notify_resume(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT
+   /*
+* Under CONFIG_PREEMPT it's possible for regs to be incoherent in the
+* case that we took an interrupt during syscall entry.  Avoid this by
+* always deferring to our notify-resume handler.
+*/
+   return true;
+#else
+   return arch_rseq_in_crit_section(p, task_pt_regs(p));
+#endif
+}
+
+void arch_rseq_handle_notify_resume(struct pt_regs *regs);
+void arch_rseq_check_critical_section(struct task_struct *p,
+ struct pt_regs *regs);
+
+#else /* !CONFIG_RESTARTABLE_SEQUENCES */
+
+static inline void arch_rseq_handle_notify_resume(struct pt_regs *regs) {}
+static inline void arch_rseq_check_critical_section(struct task_struct *p,
+   struct pt_regs *regs) {}
+
+#endif
+
+#endif /* _ASM_X86_RESTARTABLE_SEQUENCES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index febaf18..bd7827d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -113,6 +113,8 @@ obj-$(CONFIG_TRACING)   += tracepoint.o
 obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
 obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
 
+obj-$(CONFIG_RESTARTABLE_SEQUENCES)+= restartable_sequences.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/restartable_sequences.c 
b/arch/x86/kernel/restartable_sequences.c
new file mode 100644
index 000..3b38013
--- /dev/null
+++ b/arch/x86/kernel/restartable_sequences.c
@@ -0,0 +1,69 @@
+/*
+ * Restartable Sequences: x86 ABI.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2015, Google, Inc.,
+ * Paul Turner  and Andrew Hunter 
+ *
+ */
+
+#include 
+#include 
+#include 
+
+void arch_rseq_check_critical_section(struct task_struct *p,
+ struct pt_regs *regs)
+{
+   if (!arch_rseq_in_crit_section(p, regs))
+   return;
+
+   /* RSEQ only applies to user-mode execution */
+   BUG_ON(!u