On Wed, May 01, 2019 at 11:24:12PM -0400, Steven Rostedt wrote:
> On Wed, 01 May 2019 16:28:31 -0400
> Steven Rostedt <rost...@goodmis.org> wrote:
> 
> > diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> > index d309f30cf7af..50bbf4035baf 100644
> > --- a/arch/x86/entry/entry_32.S
> > +++ b/arch/x86/entry/entry_32.S
> > @@ -1478,6 +1478,17 @@ ENTRY(int3)
> >     ASM_CLAC
> >     pushl   $-1                             # mark this as an int
> >  
> > +#ifdef CONFIG_VM86
> > +   testl   $X86_EFLAGS_VM, PT_EFLAGS(%esp)
> > +   jnz     .Lfrom_usermode_no_gap
> > +#endif
> > +   testl   $SEGMENT_RPL_MASK, PT_CS(%esp)
> > +   jnz     .Lfrom_usermode_no_gap
> > +   .rept 6
> > +   pushl   5*4(%esp)
> > +   .endr
> > +.Lfrom_usermode_no_gap:
> > +
> >     SAVE_ALL switch_stacks=1
> >     ENCODE_FRAME_POINTER
> >     TRACE_IRQS_OFF
> 
> This failed to work on 32 bit at all (crashed and burned badly - triple
> fault!). 

Indeed so; find a working version below (albeit with a lot of debug
garbage still in).

It also includes the self-test code that Andy wanted -- it's what I used
to debug this mess.

Much thanks to Joerg Roedel for talking entry_32.S with me.

TL;DR, on x86_32 kernel->kernel IRET frames are only 3 entries and do
not include ESP/SS; so not only was regs->sp never set up, but even if
you changed it the change would take no effect and would corrupt random
stack state.

---
 arch/x86/entry/entry_32.S            |  87 +++++++++++++++++++++++---
 arch/x86/entry/entry_64.S            |  14 ++++-
 arch/x86/include/asm/text-patching.h |  20 ++++++
 arch/x86/kernel/alternative.c        | 116 +++++++++++++++++++++++++++++++++--
 arch/x86/kernel/traps.c              |   1 +
 5 files changed, 225 insertions(+), 13 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7b23431be5cb..01c5bdbe5f39 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -203,7 +203,7 @@
 .Lend_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 clear_cs=1
        cld
        PUSH_GS
        pushl   %fs
@@ -225,7 +225,7 @@
 
        /* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
-       SWITCH_TO_KERNEL_STACK
+       SWITCH_TO_KERNEL_STACK \clear_cs
 .endif
 
 .endm
@@ -377,8 +377,9 @@
 
 #define CS_FROM_ENTRY_STACK    (1 << 31)
 #define CS_FROM_USER_CR3       (1 << 30)
+#define CS_FROM_INT3           (1 << 29)
 
-.macro SWITCH_TO_KERNEL_STACK
+.macro SWITCH_TO_KERNEL_STACK clear_cs=1
 
        ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
 
@@ -391,12 +392,13 @@
         * that register for the time this macro runs
         */
 
+       .if \clear_cs
        /*
-        * The high bits of the CS dword (__csh) are used for
-        * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
-        * hardware didn't do this for us.
+        * The high bits of the CS dword (__csh) are used for CS_FROM_*. Clear
+        * them in case hardware didn't do this for us.
         */
        andl    $(0x0000ffff), PT_CS(%esp)
+       .endif
 
        /* Are we on the entry stack? Bail out if not! */
        movl    PER_CPU_VAR(cpu_entry_area), %ecx
@@ -1019,6 +1021,29 @@ ENTRY(entry_INT80_32)
        /* Restore user state */
        RESTORE_REGS pop=4                      # skip orig_eax/error_code
 .Lirq_return:
+       testl $CS_FROM_INT3, 4(%esp)
+       jz .Lno_iret_fixup
+
+       /*
+        * Undo the magic from ENTRY(int3), in particular consider the case
+        * where regs->sp has been modified.
+        */
+
+       pushl   %eax
+       movl    %esp, %eax
+
+       movl    4*4(%eax), %esp         # restore (modified) regs->sp
+
+       /* rebuild IRET frame */
+       pushl   3*4(%eax)               # flags
+       pushl   2*4(%eax)               # cs
+       pushl   1*4(%eax)               # ip
+
+       andl    $0x0000ffff, 4(%esp)    # clear high CS bits
+
+       movl    (%eax), %eax            # restore eax
+
+.Lno_iret_fixup:
        /*
         * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
         * when returning from IPI handler and when returning from
@@ -1477,9 +1502,57 @@ END(nmi)
 
 ENTRY(int3)
        ASM_CLAC
+
+       /*
+        * The high bits of the CS dword (__csh) are used for CS_FROM_*. Clear
+        * them in case hardware didn't do this for us.
+        */
+       andl    $0x0000ffff, 4(%esp)
+
+#ifdef CONFIG_VM86
+       testl   $X86_EFLAGS_VM, 8(%esp)
+       jnz     .Lfrom_usermode_no_gap
+#endif
+       testl   $SEGMENT_RPL_MASK, 4(%esp)
+       jnz     .Lfrom_usermode_no_gap
+
+       /*
+        * Here from kernel mode; so the (exception) stack looks like:
+        *
+        * 12(esp) - <previous context>
+        *  8(esp) - flags
+        *  4(esp) - cs
+        *  0(esp) - ip
+        *
+        * Let's build a 5 entry IRET frame after that, such that struct pt_regs
+        * is complete and in particular regs->sp is correct. This gives us
+        * the original 3 entries as gap:
+        *
+        * 32(esp) - <previous context>
+        * 28(esp) - orig_flags / gap
+        * 24(esp) - orig_cs    / gap
+        * 20(esp) - orig_ip    / gap
+        * 16(esp) - ss
+        * 12(esp) - sp
+        *  8(esp) - flags
+        *  4(esp) - cs
+        *  0(esp) - ip
+        */
+       pushl   %ss       # ss
+       pushl   %esp      # sp (points at ss)
+       pushl   4*4(%esp) # flags
+       pushl   4*4(%esp) # cs
+       pushl   4*4(%esp) # ip
+
+       add     $16, 12(%esp) # point sp back at the previous context
+
+       orl     $CS_FROM_INT3, 4(%esp) # mark magic IRET
+
+.Lfrom_usermode_no_gap:
+
        pushl   $-1                             # mark this as an int
 
-       SAVE_ALL switch_stacks=1
+       SAVE_ALL switch_stacks=1 clear_cs=0
        ENCODE_FRAME_POINTER
        TRACE_IRQS_OFF
        xorl    %edx, %edx                      # zero error code
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 20e45d9b4e15..268cd9affe04 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR                       
irq_work_interrupt              smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 
ist_offset=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 
ist_offset=0 create_gap=0
 ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -898,6 +898,16 @@ ENTRY(\sym)
        jnz     .Lfrom_usermode_switch_stack_\@
        .endif
 
+       .if \create_gap == 1
+       testb   $3, CS-ORIG_RAX(%rsp)
+       jnz     .Lfrom_usermode_no_gap_\@
+       .rept 6
+       pushq   5*8(%rsp)
+       .endr
+       UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+       .endif
+
        .if \paranoid
        call    paranoid_entry
        .else
@@ -1129,7 +1139,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
 #endif /* CONFIG_HYPERV */
 
 idtentry debug                 do_debug                has_error_code=0        
paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
-idtentry int3                  do_int3                 has_error_code=0
+idtentry int3                  do_int3                 has_error_code=0        
create_gap=1
 idtentry stack_segment         do_stack_segment        has_error_code=1
 
 #ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index c90678fd391a..6aac6abf931e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -42,4 +42,24 @@ extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
 
+static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+{
+       regs->sp -= sizeof(unsigned long);
+       *(unsigned long *)regs->sp = val;
+}
+
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+       regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+       int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
+       int3_emulate_jmp(regs, func);
+}
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4db9c0d29bc1..1e11076c3a2b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,11 +613,118 @@ extern struct paravirt_patch_site 
__start_parainstructions[],
        __stop_parainstructions[];
 #endif /* CONFIG_PARAVIRT */
 
+static __always_inline void print_stack(struct pt_regs *regs)
+{
+#if 1
+       unsigned long *end = (unsigned long *)current_stack_pointer;
+       unsigned long *frame = (unsigned long *)__builtin_frame_address(0);
+       unsigned long *stack = (unsigned long *)(current_stack_pointer & 
~(THREAD_SIZE - 1));
+       int i, j;
+
+       stack += THREAD_SIZE / sizeof(unsigned long);
+
+       printk("stack dump from: %lx\n", stack);
+
+       for (i=0; ; i++) {
+               pr_info("stack[%03d]: ", 16*i);
+               for (j=0; j<16; j++) {
+                       if (i==0 && j==0) {
+                               pr_cont(" %08lx  ", 0UL);
+                               stack--;
+                               continue;
+                       }
+                       if (stack == end)
+                               pr_cont(">%08lx< ", *(stack--));
+                       else if (stack == frame)
+                               pr_cont("*%08lx* ", *(stack--));
+                       else if (stack == regs)
+                               pr_cont("r%08lxr ", *(stack--));
+                       else if (regs && stack == regs->sp)
+                               pr_cont("s%08lxs ", *(stack--));
+                       else
+                               pr_cont(" %08lx  ", *(stack--));
+               }
+               pr_cont("\n");
+
+               if (stack < end)
+                       break;
+       }
+#endif
+}
+
+static void __init int3_magic(unsigned int *ptr)
+{
+       printk("*************** %lx\n", (unsigned long)ptr);
+       print_stack(NULL);
+       *ptr = 1;
+}
+
+static __initdata unsigned long int3_ip;
+
+static int __init int3_exception_notify(struct notifier_block *self, unsigned 
long val, void *data)
+{
+       struct die_args *args = data;
+       struct pt_regs *regs = args->regs;
+
+       if (!regs || user_mode(regs))
+               return NOTIFY_DONE;
+
+       if (val != DIE_INT3)
+               return NOTIFY_DONE;
+
+       printk("XXXXXXXXXXXXXXXXXXXXXXXXXX %lx %lx\n", regs->ip, int3_ip);
+       if (regs->ip - INT3_INSN_SIZE != int3_ip)
+               return NOTIFY_DONE;
+
+       print_stack(regs);
+       int3_emulate_call(regs, (unsigned long)&int3_magic);
+       print_stack(regs);
+
+       return NOTIFY_STOP;
+}
+
+static void __init int3_selftest(void)
+{
+       static __initdata struct notifier_block int3_exception_nb = {
+               .notifier_call  = int3_exception_notify,
+               .priority       = INT_MAX-1, /* last */
+       };
+       unsigned int val = 0;
+
+       BUG_ON(register_die_notifier(&int3_exception_nb));
+
+       printk("+++++++++++++++++++ %lx %lx\n", (unsigned long)&val, (unsigned 
long)&int3_ip);
+
+       print_stack(NULL);
+
+       /*
+        * Basically: int3_magic(&val); but really complicated :-)
+        *
+        * Stick the address of the INT3 instruction into int3_ip, then trigger
+        * the INT3, padded with NOPs to match a CALL instruction length.
+        */
+#ifdef CONFIG_X86_32
+       asm volatile ("call 1f; 1: pop (%%edx); add $5, (%%edx);"
+                     "int3; nop; nop; nop; nop" : : "d" (&int3_ip), "a" (&val) 
: "memory");
+#else /* CONFIG_X86_64 */
+       asm volatile ("call 1f; 1: pop (%%rdx); add $5, (%%rdx);"
+                     "int3; nop; nop; nop; nop" : : "d" (&int3_ip), "D" (&val) 
: "memory");
+#endif
+
+       BUG_ON(val != 1);
+
+       unregister_die_notifier(&int3_exception_nb);
+}
+
 void __init alternative_instructions(void)
 {
-       /* The patching is not fully atomic, so try to avoid local interruptions
-          that might execute the to be patched code.
-          Other CPUs are not running. */
+       int3_selftest();
+
+       /*
+        * The patching is not fully atomic, so try to avoid local
+        * interruptions that might execute the to be patched code.
+        * Other CPUs are not running.
+        */
        stop_nmi();
 
        /*
@@ -642,10 +749,11 @@ void __init alternative_instructions(void)
                                            _text, _etext);
        }
 
-       if (!uniproc_patched || num_possible_cpus() == 1)
+       if (!uniproc_patched || num_possible_cpus() == 1) {
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
+       }
 #endif
 
        apply_paravirt(__parainstructions, __parainstructions_end);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8b6d03e55d2f..e072cdd07284 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -572,6 +572,7 @@ NOKPROBE_SYMBOL(do_general_protection);
 
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
+       printk("int3 frame: %lx\n", __builtin_frame_address(0));
 #ifdef CONFIG_DYNAMIC_FTRACE
        /*
         * ftrace must be first, everything else may cause a recursive crash.

Reply via email to