Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-21 Thread Roman Gushchin
On Wed, Aug 15, 2018 at 10:37:28AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 15, 2018, at 10:32 AM, Shakeel Butt  wrote:
> > 
> >> On Wed, Aug 15, 2018 at 10:26 AM Roman Gushchin  wrote:
> >> 
> >>> On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
> >>> 
> >>> 
> > On Aug 15, 2018, at 9:55 AM, Roman Gushchin  wrote:
> > 
> >> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> >> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> >> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>   return s->addr;
> >>   }
> >> 
> >> +/*
> >> + * Allocated stacks are cached and later reused by new threads,
> >> + * so memcg accounting is performed manually on assigning/releasing
> >> + * stacks to tasks. Drop __GFP_ACCOUNT.
> >> + */
> >>   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >>VMALLOC_START, VMALLOC_END,
> >> - THREADINFO_GFP,
> >> + THREADINFO_GFP & ~__GFP_ACCOUNT,
> >>PAGE_KERNEL,
> >>0, node, __builtin_return_address(0));
> >> 
> >> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >> #endif
> >> }
> >> 
> >> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> >> +{
> >> +#ifdef CONFIG_VMAP_STACK
> >> +struct vm_struct *vm = task_stack_vm_area(tsk);
> >> +
> >> +if (vm) {
> >> +int i;
> >> +
> >> +for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> >> +memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> >> +  compound_order(vm->pages[i]));
> >> +
> >> +/* All stack pages belong to the same memcg. */
> >> +mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> >> + THREAD_SIZE / 1024);
> >> +}
> >> +#endif
> >> +}
> > 
> > Before this change, the memory limit can fail the fork, but afterwards
> > fork() can grow memory consumption unimpeded by the cgroup settings.
> > 
> > Can we continue to use try_charge() here and fail the fork?
>  
>  We can, but I'm not convinced we should.
>  
>  Kernel stack is relatively small, and it's already allocated at this point.
>  So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
>  complexity to handle this case (e.g. uncharging a partially charged stack).
>  Do you have an example where it matters?
> >>> 
> >>> What bounds it to just a few pages?  Couldn’t there be lots of forks in 
> >>> flight that all hit this path?  It’s unlikely, and there are surely 
> >>> easier DoS vectors, but still.
> >> 
> >> Because any following memcg-aware allocation will fail.
> >> There is also the pid cgroup controller, which can be used to limit the
> >> number of forks.
> >> 
> >> Anyway, I'm OK to handle this case and fail the fork,
> >> if you think it matters.
> > 
> > Roman, before adding more changes, do benchmark this. Maybe disabling
> > the stack caching for CONFIG_MEMCG would be much cleaner.
> > 
> > 
> 
> Unless memcg accounting is colossally slow, the caching should be left on. 
> vmalloc() isn’t inherently slow, but vfree() is, since we need to do a global 
> broadcast TLB flush after enough vfree() calls.

It's not.

BTW, is the test you used to measure the performance gains of
stack caching publicly available?

Thanks!


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-16 Thread Roman Gushchin
On Thu, Aug 16, 2018 at 08:35:09AM +0200, Michal Hocko wrote:
> On Wed 15-08-18 13:20:44, Johannes Weiner wrote:
> [...]
> > This is completely backwards.
> > 
> > We respect the limits unless there is a *really* strong reason not
> > to. The only situations I can think of are during OOM kills to avoid
> > memory deadlocks and during packet reception for correctness issues
> > (and because the network stack has its own way to reclaim memory).
> > 
> > Relying on some vague future allocations in the process's lifetime to
> > fail in order to contain it is crappy and unreliable. And unwinding
> > the stack allocation isn't too much complexity to warrant breaking the
> > containment rules here, even if it were several steps. But it looks
> > like it's nothing more than a 'goto free_stack'.
> > 
> > Please just fix this.
> 
> Thinking about it some more (sorry I should have done that in my
> previous reply already) I do agree with Johannes. We should really back
> off as soon as possible rather than rely on a future action because
> this is quite subtle and prone to unexpected behavior.

Ok, no problems, I'll address this in v2.

Thanks!


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-16 Thread Michal Hocko
On Wed 15-08-18 13:20:44, Johannes Weiner wrote:
[...]
> This is completely backwards.
> 
> We respect the limits unless there is a *really* strong reason not
> to. The only situations I can think of are during OOM kills to avoid
> memory deadlocks and during packet reception for correctness issues
> (and because the network stack has its own way to reclaim memory).
> 
> Relying on some vague future allocations in the process's lifetime to
> fail in order to contain it is crappy and unreliable. And unwinding
> the stack allocation isn't too much complexity to warrant breaking the
> containment rules here, even if it were several steps. But it looks
> like it's nothing more than a 'goto free_stack'.
> 
> Please just fix this.

Thinking about it some more (sorry I should have done that in my
previous reply already) I do agree with Johannes. We should really back
off as soon as possible rather than rely on a future action because
this is quite subtle and prone to unexpected behavior.

-- 
Michal Hocko
SUSE Labs


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Andy Lutomirski



> On Aug 15, 2018, at 10:32 AM, Shakeel Butt  wrote:
> 
>> On Wed, Aug 15, 2018 at 10:26 AM Roman Gushchin  wrote:
>> 
>>> On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
>>> 
>>> 
> On Aug 15, 2018, at 9:55 AM, Roman Gushchin  wrote:
> 
>> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>   return s->addr;
>>   }
>> 
>> +/*
>> + * Allocated stacks are cached and later reused by new threads,
>> + * so memcg accounting is performed manually on assigning/releasing
>> + * stacks to tasks. Drop __GFP_ACCOUNT.
>> + */
>>   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>>VMALLOC_START, VMALLOC_END,
>> - THREADINFO_GFP,
>> + THREADINFO_GFP & ~__GFP_ACCOUNT,
>>PAGE_KERNEL,
>>0, node, __builtin_return_address(0));
>> 
>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>> #endif
>> }
>> 
>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
>> +{
>> +#ifdef CONFIG_VMAP_STACK
>> +struct vm_struct *vm = task_stack_vm_area(tsk);
>> +
>> +if (vm) {
>> +int i;
>> +
>> +for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
>> +memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
>> +  compound_order(vm->pages[i]));
>> +
>> +/* All stack pages belong to the same memcg. */
>> +mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
>> + THREAD_SIZE / 1024);
>> +}
>> +#endif
>> +}
> 
> Before this change, the memory limit can fail the fork, but afterwards
> fork() can grow memory consumption unimpeded by the cgroup settings.
> 
> Can we continue to use try_charge() here and fail the fork?
 
 We can, but I'm not convinced we should.
 
 Kernel stack is relatively small, and it's already allocated at this point.
 So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
 complexity to handle this case (e.g. uncharging a partially charged stack).
 Do you have an example where it matters?
>>> 
>>> What bounds it to just a few pages?  Couldn’t there be lots of forks in 
>>> flight that all hit this path?  It’s unlikely, and there are surely easier 
>>> DoS vectors, but still.
>> 
>> Because any following memcg-aware allocation will fail.
>> There is also the pid cgroup controller, which can be used to limit the
>> number of forks.
>> 
>> Anyway, I'm OK to handle this case and fail the fork,
>> if you think it matters.
> 
> Roman, before adding more changes, do benchmark this. Maybe disabling
> the stack caching for CONFIG_MEMCG would be much cleaner.
> 
> 

Unless memcg accounting is colossally slow, the caching should be left on. 
vmalloc() isn’t inherently slow, but vfree() is, since we need to do a global 
broadcast TLB flush after enough vfree() calls.
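
For context, the caching Andy is defending parks a freed stack in a per-cpu
slot instead of vfree()ing it. A minimal sketch of that free path, modeled on
the free_thread_stack() diff quoted later in this thread; the vfree_atomic()
fallback and the non-vmap branch are filled in as assumptions, so details may
differ from the real kernel/fork.c:

static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	if (task_stack_vm_area(tsk)) {
		int i;

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			/* Park the stack in a free per-cpu slot: no vfree(),
			 * so no deferred global TLB flush is ever needed. */
			if (this_cpu_cmpxchg(cached_stacks[i],
					     NULL, tsk->stack_vm_area) != NULL)
				continue;
			return;
		}

		/* All slots taken: fall back to the (slow) vfree path. */
		vfree_atomic(tsk->stack);
		return;
	}
#endif
	/* Non-vmap stacks are plain page allocations. */
	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}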


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Shakeel Butt
On Wed, Aug 15, 2018 at 10:26 AM Roman Gushchin  wrote:
>
> On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
> >
> >
> > > On Aug 15, 2018, at 9:55 AM, Roman Gushchin  wrote:
> > >
> > >> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> > >>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> > >>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >>>return s->addr;
> > >>>}
> > >>>
> > >>> +/*
> > >>> + * Allocated stacks are cached and later reused by new threads,
> > >>> + * so memcg accounting is performed manually on assigning/releasing
> > >>> + * stacks to tasks. Drop __GFP_ACCOUNT.
> > >>> + */
> > >>>stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> > >>> VMALLOC_START, VMALLOC_END,
> > >>> - THREADINFO_GFP,
> > >>> + THREADINFO_GFP & ~__GFP_ACCOUNT,
> > >>> PAGE_KERNEL,
> > >>> 0, node, __builtin_return_address(0));
> > >>>
> > >>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >>> #endif
> > >>> }
> > >>>
> > >>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > >>> +{
> > >>> +#ifdef CONFIG_VMAP_STACK
> > >>> +struct vm_struct *vm = task_stack_vm_area(tsk);
> > >>> +
> > >>> +if (vm) {
> > >>> +int i;
> > >>> +
> > >>> +for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > >>> +memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > >>> +  compound_order(vm->pages[i]));
> > >>> +
> > >>> +/* All stack pages belong to the same memcg. */
> > >>> +mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > >>> + THREAD_SIZE / 1024);
> > >>> +}
> > >>> +#endif
> > >>> +}
> > >>
> > >> Before this change, the memory limit can fail the fork, but afterwards
> > >> fork() can grow memory consumption unimpeded by the cgroup settings.
> > >>
> > >> Can we continue to use try_charge() here and fail the fork?
> > >
> > > We can, but I'm not convinced we should.
> > >
> > > Kernel stack is relatively small, and it's already allocated at this point.
> > > So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
> > > complexity to handle this case (e.g. uncharging a partially charged stack).
> > > Do you have an example where it matters?
> >
> > What bounds it to just a few pages?  Couldn’t there be lots of forks in 
> > flight that all hit this path?  It’s unlikely, and there are surely easier 
> > DoS vectors, but still.
>
> Because any following memcg-aware allocation will fail.
> There is also the pid cgroup controller, which can be used to limit the
> number of forks.
>
> Anyway, I'm OK to handle this case and fail the fork,
> if you think it matters.

Roman, before adding more changes, do benchmark this. Maybe disabling
the stack caching for CONFIG_MEMCG would be much cleaner.

Shakeel
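
(The benchmark referenced from ac496bf48d97 is not reproduced in this
thread. A minimal stand-in that stresses the same path, a kernel stack
allocated and freed on every thread create/exit, might look like the
hypothetical test below; it is not the test used in that commit.)

/* stackbench.c: build with gcc -O2 -pthread stackbench.c */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define ITERS 100000

static void *noop(void *arg)
{
	return NULL;
}

int main(void)
{
	struct timespec t0, t1;
	pthread_t tid;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < ITERS; i++) {
		/* Each create/join cycle allocates and frees a kernel stack. */
		if (pthread_create(&tid, NULL, noop, NULL))
			return 1;
		pthread_join(tid, NULL);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	printf("%.0f ns per create+join\n", ns / ITERS);
	return 0;
}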


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Roman Gushchin
On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 15, 2018, at 9:55 AM, Roman Gushchin  wrote:
> > 
> >> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> >>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> >>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>>return s->addr;
> >>>}
> >>> 
> >>> +/*
> >>> + * Allocated stacks are cached and later reused by new threads,
> >>> + * so memcg accounting is performed manually on assigning/releasing
> >>> + * stacks to tasks. Drop __GFP_ACCOUNT.
> >>> + */
> >>>stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >>> VMALLOC_START, VMALLOC_END,
> >>> - THREADINFO_GFP,
> >>> + THREADINFO_GFP & ~__GFP_ACCOUNT,
> >>> PAGE_KERNEL,
> >>> 0, node, __builtin_return_address(0));
> >>> 
> >>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>> #endif
> >>> }
> >>> 
> >>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> >>> +{
> >>> +#ifdef CONFIG_VMAP_STACK
> >>> +struct vm_struct *vm = task_stack_vm_area(tsk);
> >>> +
> >>> +if (vm) {
> >>> +int i;
> >>> +
> >>> +for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> >>> +memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> >>> +  compound_order(vm->pages[i]));
> >>> +
> >>> +/* All stack pages belong to the same memcg. */
> >>> +mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> >>> + THREAD_SIZE / 1024);
> >>> +}
> >>> +#endif
> >>> +}
> >> 
> >> Before this change, the memory limit can fail the fork, but afterwards
> >> fork() can grow memory consumption unimpeded by the cgroup settings.
> >> 
> >> Can we continue to use try_charge() here and fail the fork?
> > 
> > We can, but I'm not convinced we should.
> > 
> > Kernel stack is relatively small, and it's already allocated at this point.
> > So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
> > complexity to handle this case (e.g. uncharging a partially charged stack).
> > Do you have an example where it matters?
> 
> What bounds it to just a few pages?  Couldn’t there be lots of forks in 
> flight that all hit this path?  It’s unlikely, and there are surely easier 
> DoS vectors, but still.

Because any following memcg-aware allocation will fail.
There is also the pid cgroup controller, which can be used to limit the
number of forks.

Anyway, I'm OK to handle this case and fail the fork,
if you think it matters.
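
(For a rough bound: with CONFIG_VMAP_STACK on x86-64 a kernel stack is
THREAD_SIZE = 16 KB, i.e. 4 pages, so each fork that slips past the limit
overcharges the cgroup by at most 16 KB. The total excess is therefore
bounded by the number of forks concurrently in flight times 16 KB, and
that in-flight count is exactly what the pids controller can cap.)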


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Johannes Weiner
On Wed, Aug 15, 2018 at 09:55:17AM -0700, Roman Gushchin wrote:
> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> > On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> > > @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >   return s->addr;
> > >   }
> > >  
> > > + /*
> > > +  * Allocated stacks are cached and later reused by new threads,
> > > +  * so memcg accounting is performed manually on assigning/releasing
> > > +  * stacks to tasks. Drop __GFP_ACCOUNT.
> > > +  */
> > >   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> > >VMALLOC_START, VMALLOC_END,
> > > -  THREADINFO_GFP,
> > > +  THREADINFO_GFP & ~__GFP_ACCOUNT,
> > >PAGE_KERNEL,
> > >0, node, __builtin_return_address(0));
> > >  
> > > @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >  #endif
> > >  }
> > >  
> > > +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > > +{
> > > +#ifdef CONFIG_VMAP_STACK
> > > + struct vm_struct *vm = task_stack_vm_area(tsk);
> > > +
> > > + if (vm) {
> > > + int i;
> > > +
> > > + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > > + memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > > +   compound_order(vm->pages[i]));
> > > +
> > > + /* All stack pages belong to the same memcg. */
> > > + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > > +  THREAD_SIZE / 1024);
> > > + }
> > > +#endif
> > > +}
> > 
> > Before this change, the memory limit can fail the fork, but afterwards
> > fork() can grow memory consumption unimpeded by the cgroup settings.
> > 
> > Can we continue to use try_charge() here and fail the fork?
> 
> We can, but I'm not convinced we should.
> 
> Kernel stack is relatively small, and it's already allocated at this point.
> So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
> complexity to handle this case (e.g. uncharging a partially charged stack).
> Do you have an example where it matters?

This is completely backwards.

We respect the limits unless there is a *really* strong reason not
to. The only situations I can think of are during OOM kills to avoid
memory deadlocks and during packet reception for correctness issues
(and because the network stack has its own way to reclaim memory).

Relying on some vague future allocations in the process's lifetime to
fail in order to contain it is crappy and unreliable. And unwinding
the stack allocation isn't too much complexity to warrant breaking the
containment rules here, even if it were several steps. But it looks
like it's nothing more than a 'goto free_stack'.

Please just fix this.
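
(A sketch of the unwind being asked for here, reusing the helpers from the
quoted patch; it illustrates the approach rather than the actual v2 code.
memcg_charge_kernel_stack() gains a return value and uncharges whatever
was partially charged before failing:)

static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
	struct vm_struct *vm = task_stack_vm_area(tsk);

	if (vm) {
		int i, ret;

		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
			/* No __GFP_NOFAIL: the charge may now fail. */
			ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL,
						compound_order(vm->pages[i]));
			if (ret)
				goto uncharge;
		}

		/* All stack pages belong to the same memcg. */
		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
				     THREAD_SIZE / 1024);
		return 0;

uncharge:
		/* Unwind the partially charged stack. */
		while (i-- > 0)
			memcg_kmem_uncharge(vm->pages[i],
					    compound_order(vm->pages[i]));
		return ret;
	}
#endif
	return 0;
}

(The caller in dup_task_struct() would then fail the fork through the
existing error path, e.g. "if (memcg_charge_kernel_stack(tsk)) goto
free_stack;".)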


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Roman Gushchin
On Tue, Aug 14, 2018 at 06:18:01PM -0700, Shakeel Butt wrote:
> On Tue, Aug 14, 2018 at 5:37 PM Roman Gushchin  wrote:
> >
> > If CONFIG_VMAP_STACK is set, kernel stacks are allocated
> > using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
> > stack pages are charged against corresponding memory cgroups
> > on allocation and uncharged on releasing them.
> >
> > The problem is that we do cache kernel stacks in small
> > per-cpu caches and do reuse them for new tasks, which can
> > belong to different memory cgroups.
> >
> > Each stack page still holds a reference to the original cgroup,
> > so the cgroup can't be released until the vmap area is released.
> >
> > To make this happen we need more than two subsequent exits
> > without forks in between on the current cpu, which makes it
> > very unlikely to happen. As a result, I saw a significant number
> > of dying cgroups (in theory, up to 2 * number_of_cpu +
> > number_of_tasks), which can't be released even by significant
> > memory pressure.
> >
> > As a cgroup structure can take a significant amount of memory
> > (first of all, per-cpu data like memcg statistics), it leads
> > to a noticeable waste of memory.
> >
> > Signed-off-by: Roman Gushchin 
> 
> I was also looking into this issue. I was thinking of having a
> per-memcg per-cpu stack cache. However, this solution seems much
> simpler.

I also thought about having a per-memcg stack cache, but it seems
that caching 2 * n(cpus) * n(cgroups) stacks is overkill,
and there is nothing memcg-specific in these stacks except
that they are pre-charged.
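
For scale: with the existing two cached stacks per CPU, a per-memcg
per-cpu cache on a 64-CPU machine running 1000 cgroups could pin
2 * 64 * 1000 = 128,000 stacks, about 2 GB at 16 KB per stack, versus
128 stacks (~2 MB) for the single shared per-cpu cache (illustrative
numbers).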

> Can you also add the performance number for a similar simple
> benchmark done in ac496bf48d97 ("fork: Optimize task creation by
> caching two thread stacks per CPU if CONFIG_VMAP_STACK=y").

Sure, will do in v2.

> 
> Reviewed-by: Shakeel Butt 

Thanks!

> 
> > Cc: Johannes Weiner 
> > Cc: Michal Hocko 
> > Cc: Andy Lutomirski 
> > Cc: Konstantin Khlebnikov 
> > Cc: Tejun Heo 
> > ---
> >  kernel/fork.c | 44 ++--
> >  1 file changed, 38 insertions(+), 6 deletions(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 69b6fea5a181..91872b2b37bd 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > return s->addr;
> > }
> >
> > +   /*
> > +* Allocated stacks are cached and later reused by new threads,
> > +* so memcg accounting is performed manually on assigning/releasing
> > +* stacks to tasks. Drop __GFP_ACCOUNT.
> > +*/
> > stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >  VMALLOC_START, VMALLOC_END,
> > -THREADINFO_GFP,
> > +THREADINFO_GFP & ~__GFP_ACCOUNT,
> >  PAGE_KERNEL,
> >  0, node, __builtin_return_address(0));
> >
> > @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >  #endif
> >  }
> >
> > +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > +{
> > +#ifdef CONFIG_VMAP_STACK
> > +   struct vm_struct *vm = task_stack_vm_area(tsk);
> > +
> > +   if (vm) {
> > +   int i;
> > +
> > +   for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > +   memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > + compound_order(vm->pages[i]));
> > +
> > +   /* All stack pages belong to the same memcg. */
> > +   mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > +THREAD_SIZE / 1024);
> > +   }
> > +#endif
> > +}
> > +
> >  static inline void free_thread_stack(struct task_struct *tsk)
> >  {
> >  #ifdef CONFIG_VMAP_STACK
> > -   if (task_stack_vm_area(tsk)) {
> > +   struct vm_struct *vm = task_stack_vm_area(tsk);
> > +
> > +   if (vm) {
> > int i;
> >
> > +   /* All stack pages belong to the same memcg. */
> > +   mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > +-(int)(THREAD_SIZE / 1024));
> > +
> > +   for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > +   memcg_kmem_uncharge(vm->pages[i],
> > + compound_order(vm->pages[i]));
> > +
> > for (i = 0; i < NR_CACHED_STACKS; i++) {
> > if (this_cpu_cmpxchg(cached_stacks[i],
> > NULL, tsk->stack_vm_area) != NULL)
> > @@ -352,10 +386,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
> > NR_KERNEL_STACK_KB,
> >

Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Andy Lutomirski



> On Aug 15, 2018, at 9:55 AM, Roman Gushchin  wrote:
> 
>> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
>>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
>>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>>return s->addr;
>>>}
>>> 
>>> +/*
>>> + * Allocated stacks are cached and later reused by new threads,
>>> + * so memcg accounting is performed manually on assigning/releasing
>>> + * stacks to tasks. Drop __GFP_ACCOUNT.
>>> + */
>>>stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>>> VMALLOC_START, VMALLOC_END,
>>> - THREADINFO_GFP,
>>> + THREADINFO_GFP & ~__GFP_ACCOUNT,
>>> PAGE_KERNEL,
>>> 0, node, __builtin_return_address(0));
>>> 
>>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>> #endif
>>> }
>>> 
>>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
>>> +{
>>> +#ifdef CONFIG_VMAP_STACK
>>> +struct vm_struct *vm = task_stack_vm_area(tsk);
>>> +
>>> +if (vm) {
>>> +int i;
>>> +
>>> +for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
>>> +memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
>>> +  compound_order(vm->pages[i]));
>>> +
>>> +/* All stack pages belong to the same memcg. */
>>> +mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
>>> + THREAD_SIZE / 1024);
>>> +}
>>> +#endif
>>> +}
>> 
>> Before this change, the memory limit can fail the fork, but afterwards
>> fork() can grow memory consumption unimpeded by the cgroup settings.
>> 
>> Can we continue to use try_charge() here and fail the fork?
> 
> We can, but I'm not convinced we should.
> 
> Kernel stack is relatively small, and it's already allocated at this point.
> So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
> complexity to handle this case (e.g. uncharging a partially charged stack).
> Do you have an example where it matters?

What bounds it to just a few pages?  Couldn’t there be lots of forks in flight 
that all hit this path?  It’s unlikely, and there are surely easier DoS 
vectors, but still.

Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Roman Gushchin
On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> > @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > return s->addr;
> > }
> >  
> > +   /*
> > +* Allocated stacks are cached and later reused by new threads,
> > +* so memcg accounting is performed manually on assigning/releasing
> > +* stacks to tasks. Drop __GFP_ACCOUNT.
> > +*/
> > stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >  VMALLOC_START, VMALLOC_END,
> > -THREADINFO_GFP,
> > +THREADINFO_GFP & ~__GFP_ACCOUNT,
> >  PAGE_KERNEL,
> >  0, node, __builtin_return_address(0));
> >  
> > @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >  #endif
> >  }
> >  
> > +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > +{
> > +#ifdef CONFIG_VMAP_STACK
> > +   struct vm_struct *vm = task_stack_vm_area(tsk);
> > +
> > +   if (vm) {
> > +   int i;
> > +
> > +   for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > +   memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > + compound_order(vm->pages[i]));
> > +
> > +   /* All stack pages belong to the same memcg. */
> > +   mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > +THREAD_SIZE / 1024);
> > +   }
> > +#endif
> > +}
> 
> Before this change, the memory limit can fail the fork, but afterwards
> fork() can grow memory consumption unimpeded by the cgroup settings.
> 
> Can we continue to use try_charge() here and fail the fork?

We can, but I'm not convinced we should.

Kernel stack is relatively small, and it's already allocated at this point.
So IMO exceeding the memcg limit by 1-2 pages isn't worse than adding the
complexity to handle this case (e.g. uncharging a partially charged stack).
Do you have an example where it matters?


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Johannes Weiner
On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>   return s->addr;
>   }
>  
> + /*
> +  * Allocated stacks are cached and later reused by new threads,
> +  * so memcg accounting is performed manually on assigning/releasing
> +  * stacks to tasks. Drop __GFP_ACCOUNT.
> +  */
>   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>VMALLOC_START, VMALLOC_END,
> -  THREADINFO_GFP,
> +  THREADINFO_GFP & ~__GFP_ACCOUNT,
>PAGE_KERNEL,
>0, node, __builtin_return_address(0));
>  
> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  #endif
>  }
>  
> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_VMAP_STACK
> + struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> + if (vm) {
> + int i;
> +
> + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> + memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> +   compound_order(vm->pages[i]));
> +
> + /* All stack pages belong to the same memcg. */
> + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +  THREAD_SIZE / 1024);
> + }
> +#endif
> +}

Before this change, the memory limit can fail the fork, but afterwards
fork() can grow memory consumption unimpeded by the cgroup settings.

Can we continue to use try_charge() here and fail the fork?


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-15 Thread Michal Hocko
On Tue 14-08-18 17:36:19, Roman Gushchin wrote:
> If CONFIG_VMAP_STACK is set, kernel stacks are allocated
> using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
> stack pages are charged against corresponding memory cgroups
> on allocation and uncharged on releasing them.
> 
> The problem is that we do cache kernel stacks in small
> per-cpu caches and do reuse them for new tasks, which can
> belong to different memory cgroups.
> 
> Each stack page still holds a reference to the original cgroup,
> so the cgroup can't be released until the vmap area is released.
> 
> To make this happen we need more than two subsequent exits
> without forks in between on the current cpu, which makes it
> very unlikely to happen. As a result, I saw a significant number
> of dying cgroups (in theory, up to 2 * number_of_cpu +
> number_of_tasks), which can't be released even by significant
> memory pressure.
> 
> As a cgroup structure can take a significant amount of memory
> (first of all, per-cpu data like memcg statistics), it leads
> to a noticeable waste of memory.
> 
> Signed-off-by: Roman Gushchin 

Fixes: ac496bf48d97 ("fork: Optimize task creation by caching two thread stacks 
per CPU if CONFIG_VMAP_STACK=y")
AFAICS

> Cc: Johannes Weiner 
> Cc: Michal Hocko 
> Cc: Andy Lutomirski 
> Cc: Konstantin Khlebnikov 
> Cc: Tejun Heo 

Yes this is the right way to do accounting here.
Acked-by: Michal Hocko 

Thanks!

> ---
>  kernel/fork.c | 44 ++--
>  1 file changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 69b6fea5a181..91872b2b37bd 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>   return s->addr;
>   }
>  
> + /*
> +  * Allocated stacks are cached and later reused by new threads,
> +  * so memcg accounting is performed manually on assigning/releasing
> +  * stacks to tasks. Drop __GFP_ACCOUNT.
> +  */
>   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>VMALLOC_START, VMALLOC_END,
> -  THREADINFO_GFP,
> +  THREADINFO_GFP & ~__GFP_ACCOUNT,
>PAGE_KERNEL,
>0, node, __builtin_return_address(0));
>  
> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  #endif
>  }
>  
> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_VMAP_STACK
> + struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> + if (vm) {
> + int i;
> +
> + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> + memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> +   compound_order(vm->pages[i]));
> +
> + /* All stack pages belong to the same memcg. */
> + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +  THREAD_SIZE / 1024);
> + }
> +#endif
> +}
> +
>  static inline void free_thread_stack(struct task_struct *tsk)
>  {
>  #ifdef CONFIG_VMAP_STACK
> - if (task_stack_vm_area(tsk)) {
> + struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> + if (vm) {
>   int i;
>  
> + /* All stack pages belong to the same memcg. */
> + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +  -(int)(THREAD_SIZE / 1024));
> +
> + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> + memcg_kmem_uncharge(vm->pages[i],
> +   compound_order(vm->pages[i]));
> +
>   for (i = 0; i < NR_CACHED_STACKS; i++) {
>   if (this_cpu_cmpxchg(cached_stacks[i],
>   NULL, tsk->stack_vm_area) != NULL)
> @@ -352,10 +386,6 @@ static void account_kernel_stack(struct task_struct 
> *tsk, int account)
>   NR_KERNEL_STACK_KB,
>   PAGE_SIZE / 1024 * account);
>   }
> -
> - /* All stack pages belong to the same memcg. */
> - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> -  account * (THREAD_SIZE / 1024));
>   } else {
>   /*
>* All stack pages are in the same zone and belong to the
> @@ -809,6 +839,8 @@ static struct task_struct *dup_task_struct(struct 
> task_struct *orig, int node)
>   if (!stack)
>   goto free_tsk;
>  
> + memcg_charge_kernel_stack(tsk);
> +
>   stack_vm_area = task_stack_vm_area(tsk);
>  
>   err = arch_dup_task_struct(tsk, orig);
> -- 
> 2.14.4

-- 
Michal Hocko
SUSE Labs


Re: [RFC PATCH 1/2] mm: rework memcg kernel stack accounting

2018-08-14 Thread Shakeel Butt
On Tue, Aug 14, 2018 at 5:37 PM Roman Gushchin  wrote:
>
> If CONFIG_VMAP_STACK is set, kernel stacks are allocated
> using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
> stack pages are charged against corresponding memory cgroups
> on allocation and uncharged on releasing them.
>
> The problem is that we cache kernel stacks in small
> per-cpu caches and reuse them for new tasks, which can
> belong to different memory cgroups.
>
> Each stack page still holds a reference to the original cgroup,
> so the cgroup can't be released until the vmap area is released.
>
> For the vmap area to be released, we need more than two
> subsequent exits without forks in between on the current cpu,
> which makes it very unlikely. As a result, I saw a significant
> number of dying cgroups (in theory, up to 2 * number_of_cpus +
> number_of_tasks), which can't be released even under significant
> memory pressure.
>
> As a cgroup structure can take a significant amount of memory
> (first of all, per-cpu data like memcg statistics), it leads
> to a noticeable waste of memory.
>
> Signed-off-by: Roman Gushchin 

I was also looking into this issue. I was thinking of having a
per-memcg per-cpu stack cache, but this solution seems much
simpler. Can you also add performance numbers for a simple
benchmark like the one done for ac496bf48d97 ("fork: Optimize task
creation by caching two thread stacks per CPU if
CONFIG_VMAP_STACK=y")?
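
For reference, a simple user-space test along those lines (illustrative
only, not the benchmark from ac496bf48d97) times repeated thread
create/join cycles, each of which allocates and frees a kernel stack:

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static void *thread_fn(void *arg)
{
	return NULL;	/* exit immediately: only stack alloc/free is measured */
}

int main(void)
{
	enum { ITERATIONS = 100000 };
	struct timespec start, end;
	pthread_t t;
	int i, err;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < ITERATIONS; i++) {
		/* Each create+join cycle allocates and frees a kernel stack. */
		err = pthread_create(&t, NULL, thread_fn, NULL);
		if (!err)
			err = pthread_join(t, NULL);
		if (err) {
			fprintf(stderr, "pthread: %s\n", strerror(err));
			return 1;
		}
	}
	clock_gettime(CLOCK_MONOTONIC, &end);

	printf("%.1f ns per create+join\n",
	       ((end.tv_sec - start.tv_sec) * 1e9 +
		(end.tv_nsec - start.tv_nsec)) / ITERATIONS);
	return 0;
}

Pinning the test to a single CPU (e.g. with taskset -c 0) keeps it hitting
the same per-cpu stack cache, which makes the cached vs. uncached cost
easier to compare.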

Reviewed-by: Shakeel Butt 

> Cc: Johannes Weiner 
> Cc: Michal Hocko 
> Cc: Andy Lutomirski 
> Cc: Konstantin Khlebnikov 
> Cc: Tejun Heo 
> ---
>  kernel/fork.c | 44 ++--
>  1 file changed, 38 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 69b6fea5a181..91872b2b37bd 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct 
> task_struct *tsk, int node)
> return s->addr;
> }
>
> +   /*
> +* Allocated stacks are cached and later reused by new threads,
> +* so memcg accounting is performed manually on assigning/releasing
> +* stacks to tasks. Drop __GFP_ACCOUNT.
> +*/
> stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>  VMALLOC_START, VMALLOC_END,
> -THREADINFO_GFP,
> +THREADINFO_GFP & ~__GFP_ACCOUNT,
>  PAGE_KERNEL,
>  0, node, __builtin_return_address(0));
>
> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct 
> task_struct *tsk, int node)
>  #endif
>  }
>
> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +   struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +   if (vm) {
> +   int i;
> +
> +   for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +   memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> + compound_order(vm->pages[i]));
> +
> +   /* All stack pages belong to the same memcg. */
> +   mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +THREAD_SIZE / 1024);
> +   }
> +#endif
> +}
> +
>  static inline void free_thread_stack(struct task_struct *tsk)
>  {
>  #ifdef CONFIG_VMAP_STACK
> -   if (task_stack_vm_area(tsk)) {
> +   struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +   if (vm) {
> int i;
>
> +   /* All stack pages belong to the same memcg. */
> +   mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +-(int)(THREAD_SIZE / 1024));
> +
> +   for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +   memcg_kmem_uncharge(vm->pages[i],
> + compound_order(vm->pages[i]));
> +
> for (i = 0; i < NR_CACHED_STACKS; i++) {
> if (this_cpu_cmpxchg(cached_stacks[i],
> NULL, tsk->stack_vm_area) != NULL)
> @@ -352,10 +386,6 @@ static void account_kernel_stack(struct task_struct 
> *tsk, int account)
> NR_KERNEL_STACK_KB,
> PAGE_SIZE / 1024 * account);
> }
> -
> -   /* All stack pages belong to the same memcg. */
> -   mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> -account * (THREAD_SIZE / 1024));
> } else {
> /*
>  * All stack pages are in the same zone and belong to the
> @@ -809,6 +839,8 @@ static struct task_struct *dup_task_struct(struct 
> task_struct *orig, int node)
> if (!stack)
> 
