Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support

2016-08-30 Thread Josh Cartwright
On Tue, Aug 30, 2016 at 03:01:51PM -0700, Andy Lutomirski wrote:
> On Wed, Aug 24, 2016 at 9:51 AM, Josh Cartwright  wrote:
[..]
> >> diff --git a/kernel/fork.c b/kernel/fork.c
> >> index 52e725d4a866..05f7ef796fb4 100644
> >> --- a/kernel/fork.c
> >> +++ b/kernel/fork.c
> >> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long 
> >> *stack)
> >>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
> >>   * kmemcache based allocator.
> >>   */
> >> -# if THREAD_SIZE >= PAGE_SIZE
> >> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> >> -   int node)
> >> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
> >> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, 
> >> int node)
> >>  {
> >> +#ifdef CONFIG_VMAP_STACK
> >> + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
> >> +VMALLOC_START, VMALLOC_END,
> >> +THREADINFO_GFP | __GFP_HIGHMEM,
> >> +PAGE_KERNEL,
> >> +0, node,
> >> +__builtin_return_address(0));
> >> +
> >> + /*
> >> +  * We can't call find_vm_area() in interrupt context, and
> >> +  * free_thread_stack can be called in interrupt context, so cache
> >> +  * the vm_struct.
> >> +  */
> >> + if (stack)
> >> + tsk->stack_vm_area = find_vm_area(stack);
> >
> > This is annoying, we end up having to walk the vm_area tree twice (once
> > for the allocation, then here to get a handle on area).
> >
> > Perhaps it's time the vmalloc code learned an allocation API that
> > returned the vm_area handle as well.
> 
> Agreed.  I may do this once everything else lands.

There are at least a few other places that could benefit from this,
doing a quick scan of find_vm_area() callers: vmalloc_{32_,}_user() and
kasan_module_alloc().

  Josh


signature.asc
Description: PGP signature


Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support

2016-08-30 Thread Andy Lutomirski
On Wed, Aug 24, 2016 at 9:51 AM, Josh Cartwright  wrote:
> Hey Andy-
>
> Small non-critical/potential future optimization comment below:
>
> On Thu, Aug 11, 2016 at 02:35:21AM -0700, Andy Lutomirski wrote:
>> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
>> vmalloc_node.
>>
>> grsecurity has had a similar feature (called
>> GRKERNSEC_KSTACKOVERFLOW) for a long time.
>>
>> Cc: Oleg Nesterov 
>> Signed-off-by: Andy Lutomirski 
>> ---
> [..]
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 52e725d4a866..05f7ef796fb4 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long 
>> *stack)
>>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
>>   * kmemcache based allocator.
>>   */
>> -# if THREAD_SIZE >= PAGE_SIZE
>> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
>> -   int node)
>> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
>> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int 
>> node)
>>  {
>> +#ifdef CONFIG_VMAP_STACK
>> + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
>> +VMALLOC_START, VMALLOC_END,
>> +THREADINFO_GFP | __GFP_HIGHMEM,
>> +PAGE_KERNEL,
>> +0, node,
>> +__builtin_return_address(0));
>> +
>> + /*
>> +  * We can't call find_vm_area() in interrupt context, and
>> +  * free_thread_stack can be called in interrupt context, so cache
>> +  * the vm_struct.
>> +  */
>> + if (stack)
>> + tsk->stack_vm_area = find_vm_area(stack);
>
> This is annoying, we end up having to walk the vm_area tree twice (once
> for the allocation, then here to get a handle on area).
>
> Perhaps it's time the vmalloc code learned an allocation API that
> returned the vm_area handle as well.
>

Agreed.  I may do this once everything else lands.


Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support

2016-08-24 Thread Josh Cartwright
Hey Andy-

Small non-critical/potential future optimization comment below:

On Thu, Aug 11, 2016 at 02:35:21AM -0700, Andy Lutomirski wrote:
> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> vmalloc_node.
> 
> grsecurity has had a similar feature (called
> GRKERNSEC_KSTACKOVERFLOW) for a long time.
> 
> Cc: Oleg Nesterov 
> Signed-off-by: Andy Lutomirski 
> ---
[..]
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 52e725d4a866..05f7ef796fb4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long 
> *stack)
>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
>   * kmemcache based allocator.
>   */
> -# if THREAD_SIZE >= PAGE_SIZE
> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> -   int node)
> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int 
> node)
>  {
> +#ifdef CONFIG_VMAP_STACK
> + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
> +VMALLOC_START, VMALLOC_END,
> +THREADINFO_GFP | __GFP_HIGHMEM,
> +PAGE_KERNEL,
> +0, node,
> +__builtin_return_address(0));
> +
> + /*
> +  * We can't call find_vm_area() in interrupt context, and
> +  * free_thread_stack can be called in interrupt context, so cache
> +  * the vm_struct.
> +  */
> + if (stack)
> + tsk->stack_vm_area = find_vm_area(stack);

This is annoying, we end up having to walk the vm_area tree twice (once
for the allocation, then here to get a handle on area).

Perhaps it's time the vmalloc code learned an allocation API that
returned the vm_area handle as well.

  Josh


signature.asc
Description: PGP signature


Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support

2016-08-24 Thread Dmitry Vyukov
On Wed, Aug 24, 2016 at 3:03 AM, Ingo Molnar  wrote:
>
> * Andy Lutomirski  wrote:
>
>> +config VMAP_STACK
>> + default y
>> + bool "Use a virtually-mapped stack"
>> + depends on HAVE_ARCH_VMAP_STACK && !KASAN
>> + ---help---
>> +   Enable this if you want to use virtually-mapped kernel stacks
>> +   with guard pages.  This causes kernel stack overflows to be
>> +   caught immediately rather than causing difficult-to-diagnose
>> +   corruption.
>> +
>> +   This is presently incompatible with KASAN because KASAN expects
>> +   the stack to map directly to the KASAN shadow map using a formula
>> +   that is incorrect if the stack is in vmalloc space.
>
> Btw., is this KASAN limitation fundamental?
>
> As x86 is going to enable this feature by default, this probably limits KASAN
> utility rather significantly.


No, it's not fundamental.

KASAN has shadow for vmalloc range, but currently we map a single
read-only zero page there (which means "this memory is good"). Stack
instrumentation tries to write to that read-only page, which causes
crash.

Andrey proposed that we can map some real writable pages in the shadow
range when we allocate a vmalloc-ed stack:
https://groups.google.com/d/msg/kasan-dev/0YxqFs9r0V8/OKoGHQL8BAAJ


Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support

2016-08-24 Thread Ingo Molnar

* Andy Lutomirski  wrote:

> +config VMAP_STACK
> + default y
> + bool "Use a virtually-mapped stack"
> + depends on HAVE_ARCH_VMAP_STACK && !KASAN
> + ---help---
> +   Enable this if you want to use virtually-mapped kernel stacks
> +   with guard pages.  This causes kernel stack overflows to be
> +   caught immediately rather than causing difficult-to-diagnose
> +   corruption.
> +
> +   This is presently incompatible with KASAN because KASAN expects
> +   the stack to map directly to the KASAN shadow map using a formula
> +   that is incorrect if the stack is in vmalloc space.

Btw., is this KASAN limitation fundamental?

As x86 is going to enable this feature by default, this probably limits KASAN 
utility rather significantly.

Thanks,

Ingo


Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support

2016-08-15 Thread Michal Hocko
On Thu 11-08-16 02:35:21, Andy Lutomirski wrote:
> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> vmalloc_node.
> 
> grsecurity has had a similar feature (called
> GRKERNSEC_KSTACKOVERFLOW) for a long time.
> 
> Cc: Oleg Nesterov 
> Signed-off-by: Andy Lutomirski 

Looks good to me.
FWIW
Acked-by: Michal Hocko 

> ---
>  arch/Kconfig| 34 +
>  arch/ia64/include/asm/thread_info.h |  2 +-
>  include/linux/sched.h   | 15 ++
>  kernel/fork.c   | 96 
> +
>  4 files changed, 126 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index bd8056b5b246..2a7f5b62e716 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -698,4 +698,38 @@ config ARCH_NO_COHERENT_DMA_MMAP
>  config CPU_NO_EFFICIENT_FFS
>   def_bool n
>  
> +config HAVE_ARCH_VMAP_STACK
> + def_bool n
> + help
> +   An arch should select this symbol if it can support kernel stacks
> +   in vmalloc space.  This means:
> +
> +   - vmalloc space must be large enough to hold many kernel stacks.
> + This may rule out many 32-bit architectures.
> +
> +   - Stacks in vmalloc space need to work reliably.  For example, if
> + vmap page tables are created on demand, either this mechanism
> + needs to work while the stack points to a virtual address with
> + unpopulated page tables or arch code (switch_to and switch_mm,
> + most likely) needs to ensure that the stack's page table entries
> + are populated before running on a possibly unpopulated stack.
> +
> +   - If the stack overflows into a guard page, something reasonable
> + should happen.  The definition of "reasonable" is flexible, but
> + instantly rebooting without logging anything would be unfriendly.
> +
> +config VMAP_STACK
> + default y
> + bool "Use a virtually-mapped stack"
> + depends on HAVE_ARCH_VMAP_STACK && !KASAN
> + ---help---
> +   Enable this if you want to use virtually-mapped kernel stacks
> +   with guard pages.  This causes kernel stack overflows to be
> +   caught immediately rather than causing difficult-to-diagnose
> +   corruption.
> +
> +   This is presently incompatible with KASAN because KASAN expects
> +   the stack to map directly to the KASAN shadow map using a formula
> +   that is incorrect if the stack is in vmalloc space.
> +
>  source "kernel/gcov/Kconfig"
> diff --git a/arch/ia64/include/asm/thread_info.h 
> b/arch/ia64/include/asm/thread_info.h
> index 29bd59790d6c..c7026429816b 100644
> --- a/arch/ia64/include/asm/thread_info.h
> +++ b/arch/ia64/include/asm/thread_info.h
> @@ -56,7 +56,7 @@ struct thread_info {
>  #define alloc_thread_stack_node(tsk, node)   ((unsigned long *) 0)
>  #define task_thread_info(tsk)((struct thread_info *) 0)
>  #endif
> -#define free_thread_stack(ti)/* nothing */
> +#define free_thread_stack(tsk)   /* nothing */
>  #define task_stack_page(tsk) ((void *)(tsk))
>  
>  #define __HAVE_THREAD_FUNCTIONS
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 62c68e513e39..20f9f47bcfd0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1923,6 +1923,9 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>   struct task_struct *oom_reaper_list;
>  #endif
> +#ifdef CONFIG_VMAP_STACK
> + struct vm_struct *stack_vm_area;
> +#endif
>  /* CPU-specific state of this task */
>   struct thread_struct thread;
>  /*
> @@ -1939,6 +1942,18 @@ extern int arch_task_struct_size __read_mostly;
>  # define arch_task_struct_size (sizeof(struct task_struct))
>  #endif
>  
> +#ifdef CONFIG_VMAP_STACK
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct 
> *t)
> +{
> + return t->stack_vm_area;
> +}
> +#else
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct 
> *t)
> +{
> + return NULL;
> +}
> +#endif
> +
>  /* Future-safe accessor for struct task_struct's cpus_allowed. */
>  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 52e725d4a866..05f7ef796fb4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long 
> *stack)
>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
>   * kmemcache based allocator.
>   */
> -# if THREAD_SIZE >= PAGE_SIZE
> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> -   int node)
> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int 
> node)
>  {
> +#ifdef CONFIG_VMAP_STACK
> + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
> +VMALLOC_START, VMALLOC_END,
> +