The x86-64 (and 32-bit, for that matter) system call slowpaths are all
in C, but the *selection* of which slow-path to take is a mixture of
complicated assembler ("sysret_check -> sysret_careful ->
sysret_signal -> sysret_audit -> int_check_syscall_exit_work" etc), and
oddly named and placed C code ("schedule_user" vs
"__audit_syscall_exit" vs "do_notify_resume").

The attached patch tries to take the "do_notify_resume()" approach,
renaming it to something sane ("syscall_exit_slowpath") and calling
out to *all* the different slow cases from that one place, instead of
having some cases hardcoded in asm and some in C. And instead of
hardcoding which cases result in an "iretq" and which can use the
faster "sysret", that decision is now simply the return value of the
syscall_exit_slowpath() function, so it's very natural and easy to say
"taking a signal forces the slow iretq case, but we can do the task
exit work and still do the sysret".
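
To make the control flow concrete, here is a rough C-level sketch of
that decision (a sketch only: the real caller is the asm around
"sysret_careful" in entry_64.S, it re-checks the work flags in a loop,
and the return_via_*() names below are just stand-ins for the asm
labels, not real functions):

	if (thread_info_flags & _TIF_ALLWORK_MASK) {
		/* Do all the slow exit work in one place, in C */
		if (syscall_exit_slowpath(regs, thread_info_flags)) {
			/* Registers may have changed (signals, ptrace, ..) */
			return_via_iretq();
		}
	}
	/* Register state untouched, the fast sysret is still fine */
	return_via_sysret();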

I've marked this as an RFC because I didn't bother trying to clean up
the 32-bit code similarly (no test cases, and trust me, if you get
this wrong it will fail spectacularly, but in very subtle and
hard-to-debug ways), and I also didn't bother with the slow cases in
the "iretq" path, so that path still has the odd asm cases and calls
the old (now legacy) do_notify_resume() path.

But this is actually tested, and seems to work (including limited
testing with strace, gdb etc). While it adds a few more lines than it
removes, the removed lines are mostly asm and the added lines are C
(and some of them are the temporary, still-extant do_notify_resume()
wrapper). In particular, it should be fairly straightforward to take
this as a starting point, remove the remaining
do_notify_resume/schedule/etc cases one by one, and get rid of more
asm code and finally the wrapper itself.

Comments? This was obviously brought on by my frustration with the
current nasty do_notify_resume() always returning via iret for the
task_work case, and with PeterZ's patch that fixed that but made the
asm mess even *worse*.

                        Linus
 arch/x86/kernel/entry_64.S | 51 ++++++++++++++--------------------------------
 arch/x86/kernel/signal.c   | 37 ++++++++++++++++++++++++++++-----
 2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..15b5953da958 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -658,30 +658,24 @@ sysret_check:
        /* Handle reschedules */
        /* edx: work, edi: workmask */
 sysret_careful:
-       bt $TIF_NEED_RESCHED,%edx
-       jnc sysret_signal
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
-       pushq_cfi %rdi
-       SCHEDULE_USER
-       popq_cfi %rdi
-       jmp sysret_check
+       SAVE_REST
+       FIXUP_TOP_OF_STACK %r11
+       movq %rsp,%rdi
+       movl %edx,%esi
+       call syscall_exit_slowpath
+       movl $_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT,%edi
+       testl %eax,%eax
+       jne sysret_using_iret
+       addq $REST_SKIP,%rsp
+       jmp sysret_check;
 
-       /* Handle a signal */
-sysret_signal:
-       TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-       bt $TIF_SYSCALL_AUDIT,%edx
-       jc sysret_audit
-#endif
-       /*
-        * We have a signal, or exit tracing or single-step.
-        * These all wind up with the iret return path anyway,
-        * so just join that path right now.
-        */
-       FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-       jmp int_check_syscall_exit_work
+sysret_using_iret:
+       RESTORE_REST
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       jmp int_with_check
 
 badsys:
        movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -703,20 +697,6 @@ auditsys:
        call __audit_syscall_entry
        LOAD_ARGS 0             /* reload call-clobbered registers */
        jmp system_call_fastpath
-
-       /*
-        * Return fast path for syscall audit.  Call __audit_syscall_exit()
-        * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-        * masked off.
-        */
-sysret_audit:
-       movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
-       cmpq $-MAX_ERRNO,%rsi   /* is it < -MAX_ERRNO? */
-       setbe %al               /* 1 if so, 0 if not */
-       movzbl %al,%edi         /* zero-extend that into %edi */
-       call __audit_syscall_exit
-       movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-       jmp sysret_check
 #endif /* CONFIG_AUDITSYSCALL */
 
        /* Do syscall tracing */
@@ -786,7 +766,6 @@ int_careful:
 int_very_careful:
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
        SAVE_REST
        /* Check for syscall exit trace */
        testl $_TIF_WORK_SYSCALL_EXIT,%edx
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9e5de6813e1f..25d02f6a65f1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -725,13 +725,30 @@ static void do_signal(struct pt_regs *regs)
 }
 
 /*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
+ * This is the system call exit slowpath, called if any of
+ * the thread info flags are set that cause us to not just
+ * return using a 'sysret' instruction.
+ *
+ * Return zero if the user register state hasn't changed,
+ * and a 'sysret' is still ok. Return nonzero if we need
+ * to go to the slow 'iret' path due to possible register
+ * changes (signals, ptrace, whatever).
  */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+__visible int syscall_exit_slowpath(struct pt_regs *regs, u32 thread_info_flags)
 {
+       int retval = 0;
+
        user_exit();
+       if (thread_info_flags & _TIF_NEED_RESCHED)
+               schedule();
+
+       if (thread_info_flags & _TIF_WORK_SYSCALL_EXIT)
+               syscall_trace_leave(regs);
+
+#ifdef CONFIG_AUDITSYSCALL
+       if (thread_info_flags & _TIF_SYSCALL_AUDIT)
+               __audit_syscall_exit(regs->ax, regs->ax < -MAX_ERRNO);
+#endif
 
 #ifdef CONFIG_X86_MCE
        /* notify userspace of pending MCEs */
@@ -743,8 +760,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
                uprobe_notify_resume(regs);
 
        /* deal with pending signal delivery */
-       if (thread_info_flags & _TIF_SIGPENDING)
+       if (thread_info_flags & _TIF_SIGPENDING) {
                do_signal(regs);
+               retval = 1;
+       }
 
        if (thread_info_flags & _TIF_NOTIFY_RESUME) {
                clear_thread_flag(TIF_NOTIFY_RESUME);
@@ -754,8 +773,16 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
                fire_user_return_notifiers();
 
        user_enter();
+       return retval;
 }
 
+/* Get rid of this old interface some day.. */
+__visible void do_notify_resume(struct pt_regs *regs, void *unused, u32 thread_info_flags)
+{
+       syscall_exit_slowpath(regs, thread_info_flags & ~(_TIF_NEED_RESCHED|_TIF_SYSCALL_AUDIT));
+}
+
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
        struct task_struct *me = current;
