Create an IRET-compatible top of stack at syscall entry and use this
information to return to user mode in the sysret path. This removes
the need for the FIXUP_TOP_OF_STACK and RESTORE_TOP_OF_STACK macros.

Signed-off-by: Alexander van Heukelum <heuke...@fastmail.fm>
---
 arch/x86/kernel/entry_64.S | 75 +++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 54 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6b95c2f..c4cb8f1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -33,8 +33,6 @@
  * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
  * Gives a full stack frame.
  * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
  * - idtentry - Define exception entry points.
  */
@@ -130,33 +128,6 @@ ENDPROC(native_usergs_sysret64)
 #endif
 
 /*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
-       /* %rsp:at FRAMEEND */
-       .macro FIXUP_TOP_OF_STACK tmp offset=0
-       movq PER_CPU_VAR(old_rsp),\tmp
-       movq \tmp,RSP+\offset(%rsp)
-       movq $__USER_DS,SS+\offset(%rsp)
-       movq $__USER_CS,CS+\offset(%rsp)
-       movq RIP+\offset(%rsp),\tmp  /* get rip */
-       movq \tmp,RCX+\offset(%rsp)  /* copy it to rcx as sysret would do */
-       movq R11+\offset(%rsp),\tmp  /* get eflags */
-       movq \tmp,EFLAGS+\offset(%rsp)
-       .endm
-
-       .macro RESTORE_TOP_OF_STACK tmp offset=0
-       movq RSP+\offset(%rsp),\tmp
-       movq \tmp,PER_CPU_VAR(old_rsp)
-       movq EFLAGS+\offset(%rsp),\tmp
-       movq \tmp,R11+\offset(%rsp)
-       .endm
-
-/*
  * initial frame state for interrupts (and exceptions without error code)
  */
        .macro EMPTY_FRAME start=1 offset=0
@@ -272,7 +243,6 @@ ENTRY(ret_from_fork)
        testl $_TIF_IA32, TI_flags(%rcx)        # 32-bit compat task needs IRET
        jnz  int_ret_from_sys_call
 
-       RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
        jmp ret_from_sys_call                   # go to the SYSRET fastpath
 
 1:
@@ -339,10 +309,24 @@ GLOBAL(system_call_after_swapgs)
         * and short:
         */
        ENABLE_INTERRUPTS(CLBR_NONE)
-       SAVE_ARGS 6*8, 0, rax_enosys=1  /* skip: hardware stackframe and orig_rax */
+       /*
+        * Save user mode rsp (temporarily saved above in old_rsp),
+        * rflags (%r11), rip (%rcx) and segments (fixed values) on
+        * the stack as a regular interrupt frame.
+        */
+       pushq_cfi $__USER_DS
+       /* CFI_REL_OFFSET ss, 0 */
+       pushq_cfi PER_CPU_VAR(old_rsp)
+       CFI_REL_OFFSET rsp, 0
+       pushq_cfi %r11 /* %r11 clobbered (userspace %rflags) */
+       /* CFI_REL_OFFSET rflags, 0 */
+       pushq_cfi $__USER_CS
+       /* CFI_REL_OFFSET cs, 0 */
+       pushq_cfi %rcx /* %rcx clobbered (userspace %rip) */
+       CFI_REL_OFFSET rip, 0
+
+       SAVE_ARGS 8, rax_enosys=1
        movq_cfi rax,(ORIG_RAX-ARGOFFSET)
-       movq  %rcx,RIP-ARGOFFSET(%rsp)
-       CFI_REL_OFFSET rip,RIP-ARGOFFSET
        testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
        jnz tracesys
 system_call_fastpath:
@@ -362,7 +346,7 @@ system_call_fastpath:
  */
 ret_from_sys_call:
        testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
-       jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+       jnz int_ret_from_sys_call       /* Go to the slow path */
 
        LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_NONE)
@@ -372,19 +356,16 @@ ret_from_sys_call:
         * sysretq will re-enable interrupts:
         */
        TRACE_IRQS_ON
+       RESTORE_ARGS addskip=-ARG_SKIP, rstor_rcx=0, rstor_r11=0
        movq RIP-ARGOFFSET(%rsp),%rcx
        CFI_REGISTER    rip,rcx
-       RESTORE_ARGS 1,-ARG_SKIP,0
+       mov EFLAGS-ARGOFFSET(%rsp), %r11
        /*CFI_REGISTER  rflags,r11*/
-       movq    PER_CPU_VAR(old_rsp), %rsp
+       mov RSP-ARGOFFSET(%rsp), %rsp
        USERGS_SYSRET64
 
        CFI_RESTORE_STATE
 
-int_ret_from_sys_call_fixup:
-       FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-       jmp int_ret_from_sys_call
-
        /* Do syscall tracing */
 tracesys:
        leaq -REST_SKIP(%rsp), %rdi
@@ -397,7 +378,6 @@ tracesys:
 
 tracesys_phase2:
        SAVE_REST
-       FIXUP_TOP_OF_STACK %rdi
        movq %rsp, %rdi
        movq $AUDIT_ARCH_X86_64, %rsi
        movq %rax,%rdx
@@ -493,10 +473,8 @@ ENTRY(stub_\func)
        PARTIAL_FRAME 0
        SAVE_REST
        pushq   %r11                    /* put it back on stack */
-       FIXUP_TOP_OF_STACK %r11, 8
        DEFAULT_FRAME 0 8               /* offset 8: return address */
        call sys_\func
-       RESTORE_TOP_OF_STACK %r11, 8
        ret $REST_SKIP          /* pop extended registers */
        CFI_ENDPROC
 END(stub_\func)
@@ -506,9 +484,7 @@ END(stub_\func)
 ENTRY(\label)
        CFI_STARTPROC
        PARTIAL_FRAME 0 8               /* offset 8: return address */
-       FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
        call \func
-       RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
        ret
        CFI_ENDPROC
 END(\label)
@@ -524,7 +500,6 @@ ENTRY(stub_execve)
        addq $8, %rsp
        PARTIAL_FRAME 0
        SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
        call sys_execve
        movq %rax,RAX(%rsp)
        RESTORE_REST
@@ -537,9 +512,7 @@ ENTRY(stub_execveat)
        addq $8, %rsp
        PARTIAL_FRAME 0
        SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
        call sys_execveat
-       RESTORE_TOP_OF_STACK %r11
        movq %rax,RAX(%rsp)
        RESTORE_REST
        jmp int_ret_from_sys_call
@@ -555,7 +528,6 @@ ENTRY(stub_rt_sigreturn)
        addq $8, %rsp
        PARTIAL_FRAME 0
        SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
        call sys_rt_sigreturn
        movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
        RESTORE_REST
@@ -569,7 +541,6 @@ ENTRY(stub_x32_rt_sigreturn)
        addq $8, %rsp
        PARTIAL_FRAME 0
        SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
        call sys32_x32_rt_sigreturn
        movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
        RESTORE_REST
@@ -582,9 +553,7 @@ ENTRY(stub_x32_execve)
        addq $8, %rsp
        PARTIAL_FRAME 0
        SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
        call compat_sys_execve
-       RESTORE_TOP_OF_STACK %r11
        movq %rax,RAX(%rsp)
        RESTORE_REST
        jmp int_ret_from_sys_call
@@ -596,9 +565,7 @@ ENTRY(stub_x32_execveat)
        addq $8, %rsp
        PARTIAL_FRAME 0
        SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
        call compat_sys_execveat
-       RESTORE_TOP_OF_STACK %r11
        movq %rax,RAX(%rsp)
        RESTORE_REST
        jmp int_ret_from_sys_call
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to