On Tue, Aug 23, 2011 at 06:33:17PM +0100, Al Viro wrote:
>       * asm glue is subtle, evil and doesn't have anywhere near enough
> documentation ;-/

I took the liberty of documenting some of your asm glue analysis in an
attempt to make the code a bit more understandable. How about the
following:

--
From: Borislav Petkov <borislav.pet...@amd.com>
Date: Wed, 24 Aug 2011 14:30:43 +0200
Subject: [PATCH] x86, asm: Document some of the syscall asm glue

Document some of the asm glue around compat SYSCALL32 and do a
whitespace cleanup while at it. See linked thread below for further
reference.

Link: http://lkml.kernel.org/r/20110820011845.gc2...@zeniv.linux.org.uk
Signed-off-by: Borislav Petkov <borislav.pet...@amd.com>
---
 arch/x86/ia32/ia32entry.S  |  138 ++++++++++++++++++++++++++-----------------
 arch/x86/kernel/entry_64.S |   19 +++++-
 2 files changed, 98 insertions(+), 59 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..8254432 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -1,16 +1,16 @@
 /*
- * Compatibility mode system call entry point for x86-64. 
- *             
+ * Compatibility mode system call entry point for x86-64.
+ *
  * Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */             
+ */
 
 #include <asm/dwarf2.h>
 #include <asm/calling.h>
 #include <asm/asm-offsets.h>
 #include <asm/current.h>
 #include <asm/errno.h>
-#include <asm/ia32_unistd.h>   
-#include <asm/thread_info.h>   
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
@@ -38,11 +38,11 @@
        xchg    %ecx,%esi
        movl    %ebx,%edi
        movl    %edx,%edx       /* zero extension */
-       .endm 
+       .endm
 
-       /* clobbers %eax */     
+       /* clobbers %eax */
        .macro  CLEAR_RREGS offset=0, _r9=rax
-       xorl    %eax,%eax
+       xorl    %eax,%eax
        movq    %rax,\offset+R11(%rsp)
        movq    %rax,\offset+R10(%rsp)
        movq    %\_r9,\offset+R9(%rsp)
@@ -69,7 +69,7 @@
        movl \offset+64(%rsp),%edi
        movl %eax,%eax                  /* zero extension */
        .endm
-       
+
        .macro CFI_STARTPROC32 simple
        CFI_STARTPROC   \simple
        CFI_UNDEFINED   r8
@@ -106,14 +106,14 @@ ENDPROC(native_irq_enable_sysexit)
  * %esi Arg4
  * %edi Arg5
  * %ebp user stack
- * 0(%ebp) Arg6        
- *     
+ * 0(%ebp) Arg6
+ *
  * Interrupts off.
- *     
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
  * path below. Set up a complete hardware stack frame to share code
  * with the int 0x80 path.
- */    
+ */
 ENTRY(ia32_sysenter_target)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
@@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target)
         * disabled irqs, here we enable it straight after entry:
         */
        ENABLE_INTERRUPTS(CLBR_NONE)
-       movl    %ebp,%ebp               /* zero extension */
+       movl    %ebp,%ebp               /* zero extension */
        pushq_cfi $__USER32_DS
        /*CFI_REL_OFFSET ss,0*/
        pushq_cfi %rbp
@@ -144,12 +144,12 @@ ENTRY(ia32_sysenter_target)
        pushq_cfi %rax
        cld
        SAVE_ARGS 0,1,0
-       /* no need to do an access_ok check here because rbp has been
-          32bit zero extended */ 
+       /* no need to do an access_ok check here because rbp has been
+          32bit zero extended */
 1:     movl    (%rbp),%ebp
-       .section __ex_table,"a"
-       .quad 1b,ia32_badarg
-       .previous       
+       .section __ex_table,"a"
+       .quad 1b,ia32_badarg
+       .previous
        GET_THREAD_INFO(%r10)
        orl    $TS_COMPAT,TI_status(%r10)
        testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -170,7 +170,7 @@ sysenter_dispatch:
 sysexit_from_sys_call:
        andl    $~TS_COMPAT,TI_status(%r10)
        /* clear IF, that popfq doesn't enable interrupts early */
-       andl  $~0x200,EFLAGS-R11(%rsp) 
+       andl  $~0x200,EFLAGS-R11(%rsp)
        movl    RIP-R11(%rsp),%edx              /* User %eip */
        CFI_REGISTER rip,rdx
        RESTORE_ARGS 0,24,0,0,0,0
@@ -260,20 +260,21 @@ ENDPROC(ia32_sysenter_target)
  * Arguments:
  * %eax        System call number.
  * %ebx Arg1
- * %ecx return EIP 
+ * %ecx return EIP
  * %edx Arg3
  * %esi Arg4
  * %edi Arg5
- * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
- * %esp user stack 
+ * %ebp Arg2    [note: not saved in the stack frame, should not be touched
+ *              because it is callee-saved in 64-bit calling convention]
+ * %esp user stack
  * 0(%esp) Arg6
- *     
+ *
  * Interrupts off.
- *     
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
  * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.     
- */    
+ * with the int 0x80 path.
+ */
 ENTRY(ia32_cstar_target)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
@@ -281,34 +282,57 @@ ENTRY(ia32_cstar_target)
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
+
+       /* stash away usermode stack ptr */
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
        movq    PER_CPU_VAR(kernel_stack),%rsp
+
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs and here we enable it straight after entry:
         */
        ENABLE_INTERRUPTS(CLBR_NONE)
        SAVE_ARGS 8,0,0
-       movl    %eax,%eax       /* zero extension */
+       movl    %eax,%eax       /* zero extension */
        movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
+
+       /* return-RIP is in %ecx when executing SYSCALL */
        movq    %rcx,RIP-ARGOFFSET(%rsp)
        CFI_REL_OFFSET rip,RIP-ARGOFFSET
-       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+
+       /*
+        * Put Arg2 into %rcx pt_regs slot to match kernel syscall
+        * calling conventions, i.e. what INT80 would expect;
+        * this lies slightly to ptrace
+        */
+       movq    %rbp,RCX-ARGOFFSET(%rsp)
        movl    %ebp,%ecx
        movq    $__USER32_CS,CS-ARGOFFSET(%rsp)
        movq    $__USER32_DS,SS-ARGOFFSET(%rsp)
+
+       /* rFLAGS is in %r11 when executing SYSCALL */
        movq    %r11,EFLAGS-ARGOFFSET(%rsp)
        /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
-       movq    %r8,RSP-ARGOFFSET(%rsp) 
+
+       /* save usermode stack ptr into pt_regs */
+       movq    %r8,RSP-ARGOFFSET(%rsp)
        CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-       /* no need to do an access_ok check here because r8 has been
-          32bit zero extended */ 
-       /* hardware stack frame is complete now */      
+
+       /*
+        * Get Arg6 which is on the usermode stack; no need to do an
+        * access_ok check here because %r8 has been 32bit zero extended.
+        * hardware stack frame is complete now.
+        */
 1:     movl    (%r8),%r9d
+
+       /*
+        * handle pagefaulting when accessing usermode stack by returning
+        * -EFAULT
+        */
        .section __ex_table,"a"
        .quad 1b,ia32_badarg
-       .previous       
+       .previous
        GET_THREAD_INFO(%r10)
        orl   $TS_COMPAT,TI_status(%r10)
        testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -331,7 +355,7 @@ sysretl_from_sys_call:
        RESTORE_ARGS 0,-ARG_SKIP,0,0,0
        movl RIP-ARGOFFSET(%rsp),%ecx
        CFI_REGISTER rip,rcx
-       movl EFLAGS-ARGOFFSET(%rsp),%r11d       
+       movl EFLAGS-ARGOFFSET(%rsp),%r11d
        /*CFI_REGISTER rflags,r11*/
        xorq    %r10,%r10
        xorq    %r9,%r9
@@ -340,7 +364,7 @@ sysretl_from_sys_call:
        movl RSP-ARGOFFSET(%rsp),%esp
        CFI_RESTORE rsp
        USERGS_SYSRET32
-       
+
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
        CFI_RESTORE_STATE
@@ -358,6 +382,8 @@ cstar_tracesys:
        testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
        jz cstar_auditsys
 #endif
+
+       /* put Arg6 into %ebp where ptrace expects it */
        xchgl %r9d,%ebp
        SAVE_REST
        CLEAR_RREGS 0, r9
@@ -366,21 +392,23 @@ cstar_tracesys:
        call syscall_trace_enter
        LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace 
changed it */
        RESTORE_REST
+
+       /* sync back Arg6's possibly changed value where it is expected by C */
        xchgl %ebp,%r9d
        cmpq $(IA32_NR_syscalls-1),%rax
        ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
        jmp cstar_do_call
 END(ia32_cstar_target)
-                               
+
 ia32_badarg:
        movq $-EFAULT,%rax
        jmp ia32_sysret
        CFI_ENDPROC
 
-/* 
- * Emulated IA32 system calls via int 0x80. 
+/*
+ * Emulated IA32 system calls via int 0x80.
  *
- * Arguments:   
+ * Arguments:
  * %eax        System call number.
  * %ebx Arg1
  * %ecx Arg2
@@ -390,13 +418,13 @@ ia32_badarg:
  * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
  *
  * Notes:
- * Uses the same stack frame as the x86-64 version.    
+ * Uses the same stack frame as the x86-64 version.
  * All registers except %eax must be saved (but ptrace may violate that)
  * Arguments are zero extended. For system calls that want sign extension and
  * take long arguments a wrapper is needed. Most calls can just be called
  * directly.
- * Assumes it is only called from user space and entered with interrupts off.  
- */                            
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
 
 ENTRY(ia32_syscall)
        CFI_STARTPROC32 simple
@@ -433,9 +461,9 @@ ia32_sysret:
        movq %rax,RAX-ARGOFFSET(%rsp)
 ia32_ret_from_sys_call:
        CLEAR_RREGS -ARGOFFSET
-       jmp int_ret_from_sys_call 
+       jmp int_ret_from_sys_call
 
-ia32_tracesys:                  
+ia32_tracesys:
        SAVE_REST
        CLEAR_RREGS
        movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
@@ -457,13 +485,13 @@ quiet_ni_syscall:
        movq $-ENOSYS,%rax
        ret
        CFI_ENDPROC
-       
+
        .macro PTREGSCALL label, func, arg
        .globl \label
 \label:
        leaq \func(%rip),%rax
        leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
-       jmp  ia32_ptregs_common 
+       jmp  ia32_ptregs_common
        .endm
 
        CFI_STARTPROC32
@@ -537,7 +565,7 @@ ia32_sys_call_table:
        .quad quiet_ni_syscall  /* old stty syscall holder */
        .quad quiet_ni_syscall  /* old gtty syscall holder */
        .quad sys_access
-       .quad sys_nice  
+       .quad sys_nice
        .quad quiet_ni_syscall  /* 35 */        /* old ftime syscall holder */
        .quad sys_sync
        .quad sys32_kill
@@ -616,7 +644,7 @@ ia32_sys_call_table:
        .quad stub32_iopl               /* 110 */
        .quad sys_vhangup
        .quad quiet_ni_syscall  /* old "idle" system call */
-       .quad sys32_vm86_warning        /* vm86old */ 
+       .quad sys32_vm86_warning        /* vm86old */
        .quad compat_sys_wait4
        .quad sys_swapoff               /* 115 */
        .quad compat_sys_sysinfo
@@ -669,7 +697,7 @@ ia32_sys_call_table:
        .quad sys_mremap
        .quad sys_setresuid16
        .quad sys_getresuid16   /* 165 */
-       .quad sys32_vm86_warning        /* vm86 */ 
+       .quad sys32_vm86_warning        /* vm86 */
        .quad quiet_ni_syscall  /* query_module */
        .quad sys_poll
        .quad compat_sys_nfsservctl
@@ -724,10 +752,10 @@ ia32_sys_call_table:
        .quad sys_mincore
        .quad sys_madvise
        .quad compat_sys_getdents64     /* 220 getdents64 */
-       .quad compat_sys_fcntl64        
+       .quad compat_sys_fcntl64
        .quad quiet_ni_syscall          /* tux */
-       .quad quiet_ni_syscall          /* security */
-       .quad sys_gettid        
+       .quad quiet_ni_syscall          /* security */
+       .quad sys_gettid
        .quad sys32_readahead   /* 225 */
        .quad sys_setxattr
        .quad sys_lsetxattr
@@ -742,7 +770,7 @@ ia32_sys_call_table:
        .quad sys_lremovexattr
        .quad sys_fremovexattr
        .quad sys_tkill
-       .quad sys_sendfile64 
+       .quad sys_sendfile64
        .quad compat_sys_futex          /* 240 */
        .quad compat_sys_sched_setaffinity
        .quad compat_sys_sched_getaffinity
@@ -754,7 +782,7 @@ ia32_sys_call_table:
        .quad compat_sys_io_submit
        .quad sys_io_cancel
        .quad sys32_fadvise64           /* 250 */
-       .quad quiet_ni_syscall  /* free_huge_pages */
+       .quad quiet_ni_syscall  /* free_huge_pages */
        .quad sys_exit_group
        .quad sys32_lookup_dcookie
        .quad sys_epoll_create
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6419bb0..9569f11 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -607,10 +607,16 @@ tracesys:
 GLOBAL(int_ret_from_sys_call)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
+
+       /*
+        * check the Requestor Privilege Level of the CS selector
+        * previously pushed on the stack. If 0, we're returning
+        * to kernel space.
+        */
        testl $3,CS-ARGOFFSET(%rsp)
        je retint_restore_args
-       movl $_TIF_ALLWORK_MASK,%edi
        /* edi: mask to check */
+       movl $_TIF_ALLWORK_MASK,%edi
 GLOBAL(int_with_check)
        LOCKDEP_SYS_EXIT_IRQ
        GET_THREAD_INFO(%rcx)
@@ -618,11 +624,16 @@ GLOBAL(int_with_check)
        andl %edi,%edx
        jnz   int_careful
        andl    $~TS_COMPAT,TI_status(%rcx)
+
+       /* no work pending, return to userspace */
        jmp   retint_swapgs
 
-       /* Either reschedule or signal or syscall exit tracking needed. */
-       /* First do a reschedule test. */
-       /* edx: work, edi: workmask */
+       /*
+        * Either reschedule or signal or syscall exit tracking
+        * needed. First do a reschedule test.
+        *
+        * edx: work, edi: workmask
+        */
 int_careful:
        bt $TIF_NEED_RESCHED,%edx
        jnc  int_very_careful
-- 
1.7.4


-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551


------------------------------------------------------------------------------
EMC VNX: the world's simplest storage, starting under $10K
The only unified storage solution that offers unified management 
Up to 160% more powerful than alternatives and 25% more efficient. 
Guaranteed. http://p.sf.net/sfu/emc-vnx-dev2dev
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel

Reply via email to