Re: [PATCH] cfq: fix starvation of asynchronous writes

2016-09-23 Thread Glauber Costa
On Fri, Sep 23, 2016 at 7:28 AM, Glauber Costa <glau...@scylladb.com> wrote:
> On Sep 23, 2016 6:22 AM, "Paolo Valente" <paolo.vale...@unimore.it> wrote:
>>
>>
>> > Il giorno 23 set 2016, alle ore 02:59, Glauber Costa
>> > <glau...@scylladb.com> ha scritto:
>> >
>> > While debugging timeouts happening in my application workload
>> > (ScyllaDB), I have
>> > observed calls to open() taking a long time, ranging everywhere from 2
>> > seconds -
>> > the first ones that are enough to time out my application - to more than
>> > 30
>> > seconds.
>> >
>> > The problem seems to happen because XFS may block on pending metadata
>> > updates
>> > under certain circumstances, and that's confirmed with the following
>> > backtrace
>> > taken by the offcputime tool (iovisor/bcc):
>> >
>> >b90c57b1 finish_task_switch
>> >b97dffb5 schedule
>> >b97e310c schedule_timeout
>> >b97e1f12 __down
>> >b90ea821 down
>> >c046a9dc xfs_buf_lock
>> >c046abfb _xfs_buf_find
>> >c046ae4a xfs_buf_get_map
>> >c046babd xfs_buf_read_map
>> >c0499931 xfs_trans_read_buf_map
>> >c044a561 xfs_da_read_buf
>> >c0451390 xfs_dir3_leaf_read.constprop.16
>> >c0452b90 xfs_dir2_leaf_lookup_int
>> >c0452e0f xfs_dir2_leaf_lookup
>> >c044d9d3 xfs_dir_lookup
>> >c047d1d9 xfs_lookup
>> >c0479e53 xfs_vn_lookup
>> >b925347a path_openat
>> >b9254a71 do_filp_open
>> >b9242a94 do_sys_open
>> >b9242b9e sys_open
>> >b97e42b2 entry_SYSCALL_64_fastpath
>> >7fb0698162ed [unknown]
>> >
>> > Inspecting my run with blktrace, I can see that the xfsaild kthread exhibits
>> > very high "Dispatch wait" times, in the dozens-of-seconds range, consistent
>> > with the open() times I saw in that run.
>> >
>> > Still from the blktrace output, we can, after searching a bit, identify the
>> > request that wasn't dispatched:
>> >
>> >  8,0   11      152    81.092472813   804  A  WM 141698288 + 8 <- (8,1) 141696240
>> >  8,0   11      153    81.092472889   804  Q  WM 141698288 + 8 [xfsaild/sda1]
>> >  8,0   11      154    81.092473207   804  G  WM 141698288 + 8 [xfsaild/sda1]
>> >  8,0   11      206    81.092496118   804  I  WM 141698288 + 8 (   22911) [xfsaild/sda1]
>> >  <=== 'I' means Inserted (into the IO scheduler) ===>
>> >  8,0    0   289372    96.718761435     0  D  WM 141698288 + 8 (15626265317) [swapper/0]
>> >  <=== Only 15s later the CFQ scheduler dispatches the request ===>
>> >
>> > As we can see above, in this particular example CFQ took 15 seconds to
>> > dispatch
>> > this request. Going back to the full trace, we can see that the xfsaild
>> > queue
>> > had plenty of opportunity to run, and it was selected as the active
>> > queue many
>> > times. It would just always be preempted by something else (example):
>> >
>> >  8,0   10    81.117912979   0  m   N cfq1618SN / insert_request
>> >  8,0   10    81.117913419   0  m   N cfq1618SN / add_to_rr
>> >  8,0   10    81.117914044   0  m   N cfq1618SN / preempt
>> >  8,0   10    81.117914398   0  m   N cfq767A  / slice expired t=1
>> >  8,0   10    81.117914755   0  m   N cfq767A  / resid=40
>> >  8,0   10    81.117915340   0  m   N / served: vt=1948520448 min_vt=1948520448
>> >  8,0   10    81.117915858   0  m   N cfq767A  / sl_used=1 disp=0 charge=0 iops=1 sect=0
>> >
>> > where cfq767 is the xfsaild queue and cfq1618 corresponds to one of the
>> > ScyllaDB
>> > IO dispatchers.
>> >
>> > The requests preempting the xfsaild queue are synchronous requests.
>> > That's a
>> > characteristic of ScyllaDB workloads, as we only ever issue O_DIRECT
>> > requests.
>> > While it can be argued that preempting ASYNC requests in favor of SYNC is part
>> > of the CFQ logic, I don't believe that doing so for 15+ sec

[PATCH] cfq: fix starvation of asynchronous writes

2016-09-22 Thread Glauber Costa
@  1966
 4 |@54
 8 | 36
16 |  7
32 |  0
64 |  0
   ~
  1024 |  0
  2048 |  0
  4096 |  1
  8192 |  1
 16384 |  2
 32768 |  0
 65536 |  0
131072 |  1
262144 |  0
524288 |          0

Signed-off-by: Glauber Costa <glau...@scylladb.com>
CC: Jens Axboe <ax...@kernel.dk>
CC: linux-bl...@vger.kernel.org
CC: linux-kernel@vger.kernel.org

Signed-off-by: Glauber Costa <glau...@scylladb.com>
---
 block/cfq-iosched.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index cc2f6db..5e24d88 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
 	if (ktime_get_ns() < rq->fifo_time)
 		rq = NULL;
 
-	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
 	return rq;
 }
 
@@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	unsigned int max_dispatch;
 
+	if (cfq_cfqq_must_dispatch(cfqq))
+		return true;
+
 	/*
 	 * Drain async requests before we start sync IO
 	 */
@@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
 
+	rq = cfq_check_fifo(cfqq);
+	if (rq)
+		cfq_mark_cfqq_must_dispatch(cfqq);
+
 	if (!cfq_may_dispatch(cfqd, cfqq))
 		return false;
 
 	/*
 	 * follow expired path, else get first next available
 	 */
-	rq = cfq_check_fifo(cfqq);
 	if (!rq)
 		rq = cfqq->next_rq;
+	else
+		cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
 
 	/*
 	 * insert request into driver dispatch list
@@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * if the new request is sync, but the currently running queue is
 	 * not, let the sync request have priority.
 	 */
-	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
+	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
 		return true;
 
 	/*
-- 
2.5.5
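
To make the effect of the patch above easier to follow, here is a small
self-contained toy model of the new must_dispatch logic (plain user-space C,
not CFQ code; the timestamps echo the blktrace excerpt above, and the 250ms
value assumes CFQ's default async fifo_expire of HZ/4):

#include <stdbool.h>
#include <stdio.h>

struct toy_queue {
        bool sync;
        bool must_dispatch;
        long long oldest_rq_time;   /* ms timestamp of the oldest pending request */
};

static const long long fifo_expire_async = 250;   /* ms; assumed CFQ default */

/* Rough equivalent of the cfq_check_fifo() + cfq_mark_cfqq_must_dispatch()
 * step the patch adds before the cfq_may_dispatch() decision. */
static void toy_check_fifo(struct toy_queue *q, long long now)
{
        if (now - q->oldest_rq_time >= fifo_expire_async)
                q->must_dispatch = true;
}

/* Old rule: a sync request always preempts an async queue.
 * New rule: not if the async queue holds a FIFO-expired request. */
static bool toy_should_preempt(const struct toy_queue *active, bool new_rq_is_sync)
{
        if (active->must_dispatch)
                return false;
        return new_rq_is_sync && !active->sync;
}

int main(void)
{
        /* The xfsaild write from the trace: queued around t=81.09s and still
         * undispatched at t=96.71s. Times below are in milliseconds. */
        struct toy_queue xfsaild = { .sync = false, .must_dispatch = false,
                                     .oldest_rq_time = 81092 };

        toy_check_fifo(&xfsaild, 96718);
        printf("sync request may preempt xfsaild? %s\n",
               toy_should_preempt(&xfsaild, true) ? "yes" : "no");
        return 0;
}

With the old rule the incoming sync requests could preempt the async xfsaild
queue indefinitely; with the must_dispatch mark, the preemption check answers
"no" once the FIFO deadline has passed, so the expired request finally goes out.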




Re: [BUG] Paravirtual time accounting / IRQ time accounting

2014-03-20 Thread Glauber Costa
On Wed, Mar 19, 2014 at 6:42 AM, <lwch...@cs.hku.hk> wrote:
> In consolidated environments, when there are multiple virtual machines (VMs)
> running on one CPU core, timekeeping will be a problem to the guest OS.
> Here, I report my findings about Linux process scheduler.
>
>
> Description
> 
> Linux CFS relies on rq->clock_task to charge each task, determine vruntime,
> etc.
>
> When CONFIG_IRQ_TIME_ACCOUNTING is enabled, the time spent on serving IRQ
> will be excluded from updating rq->clock_task.
> When CONFIG_PARAVIRT_TIME_ACCOUNTING is enabled, the time stolen by the
> hypervisor
> will also be excluded from updating rq->clock_task.
>
> With "both" CONFIG_IRQ_TIME_ACCOUNTING and CONFIG_PARAVIRT_TIME_ACCOUNTING
> enabled,
> I put three KVM guests on one core and run hackbench in each guest. I find
> that
> in the guests, rq->clock_task stays *unchanged*. The malfunction embarrasses
> CFS.
> 
>
>
> Analysis
> 
> [src/kernel/sched/core.c]
> static void update_rq_clock_task(struct rq *rq, s64 delta)
> {
> ... ...
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
> ... ...
> rq->prev_irq_time += irq_delta;
> delta -= irq_delta;
> #endif
>
> #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> if (static_key_false((&paravirt_steal_rq_enabled))) {
> steal = paravirt_steal_clock(cpu_of(rq));
> steal -= rq->prev_steal_time_rq;
> ... ...
> rq->prev_steal_time_rq += steal;
> delta -= steal;
> }
> #endif
>
> rq->clock_task += delta;
> ... ...
> }
> --
> "delta" -> the intended increment to rq->clock_task
> "irq_delta" -> the time spent on serving IRQ (hard + soft)
> "steal" -> the time stolen by the underlying hypervisor
> --
> "irq_delta" is calculated based on sched_clock_cpu(), which is vulnerable
> to VM scheduling delays.

This looks like a real problem indeed. The main problem in searching for a
solution is that, of course, not all of the irq time is steal time and vice
versa. In this case, we could subtract irq_time from steal and account only the
part of the steal time that is in excess. I don't think this is 100% guaranteed,
but maybe it is a good approximation.
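
Roughly what that suggestion would look like, as a minimal user-space sketch of
the accounting arithmetic (toy function, not a kernel patch; the final clamping
mirrors what update_rq_clock_task() already does today):

#include <stdio.h>

typedef long long s64;

/* Toy model of the clock_task update with the adjustment suggested above:
 * treat the irq time already removed as potentially contained in the steal
 * sample, and subtract only the part of "steal" that exceeds "irq_delta".
 * All values are nanoseconds. */
static s64 clock_task_delta(s64 delta, s64 irq_delta, s64 steal)
{
        delta -= irq_delta;              /* CONFIG_IRQ_TIME_ACCOUNTING part */

        if (steal > irq_delta)           /* proposed: charge only the excess */
                steal -= irq_delta;
        else
                steal = 0;

        if (steal > delta)               /* never drive the increment negative */
                steal = delta;
        delta -= steal;                  /* CONFIG_PARAVIRT_TIME_ACCOUNTING part */

        return delta;
}

int main(void)
{
        /* A 10ms interval where 3ms was spent in irq context and the hypervisor
         * reports 8ms stolen: today clock_task barely moves; with the
         * adjustment roughly 2ms of task time survives. */
        printf("clock_task advances by %lld ns\n",
               clock_task_delta(10000000LL, 3000000LL, 8000000LL));
        return 0;
}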

Rik, do you have an opinion on this ?



Re: [PATCH 3/8] memcg, slab: never try to merge memcg caches

2014-02-04 Thread Glauber Costa
On Tue, Feb 4, 2014 at 8:04 PM, Vladimir Davydov  wrote:
> On 02/04/2014 07:43 PM, Glauber Costa wrote:
>> On Tue, Feb 4, 2014 at 7:27 PM, Vladimir Davydov  
>> wrote:
>>> On 02/04/2014 07:11 PM, Michal Hocko wrote:
>>>> On Tue 04-02-14 18:59:23, Vladimir Davydov wrote:
>>>>> On 02/04/2014 06:52 PM, Michal Hocko wrote:
>>>>>> On Sun 02-02-14 20:33:48, Vladimir Davydov wrote:
>>>>>>> Suppose we are creating memcg cache A that could be merged with cache B
>>>>>>> of the same memcg. Since any memcg cache has the same parameters as its
>>>>>>> parent cache, parent caches PA and PB of memcg caches A and B must be
>>>>>>> mergeable too. That means PA was merged with PB on creation or vice
>>>>>>> versa, i.e. PA = PB. From that it follows that A = B, and we couldn't
>>>>>>> even try to create cache B, because it already exists - a contradiction.
>>>>>> I cannot tell I understand the above but I am totally not sure about the
>>>>>> statement below.
>>>>>>
>>>>>>> So let's remove unused code responsible for merging memcg caches.
>>>>>> How come the code was unused? find_mergeable called cache_match_memcg...
>>>>> Oh, sorry for misleading comment. I mean the code handling merging of
>>>>> per-memcg caches is useless, AFAIU: if we find an alias for a per-memcg
>>>>> cache on kmem_cache_create_memcg(), the parent of the found alias must
>>>>> be the same as the parent_cache passed to kmem_cache_create_memcg(), but
>>>>> if it were so, we would never proceed to the memcg cache creation,
>>>>> because the cache we want to create already exists.
>>>> I am still not sure I understand this correctly. So the outcome of this
>>>> patch is that compatible caches of different memcgs can be merged
>>>> together? Sorry if this is a stupid question but I am not that familiar
>>>> with this area much I am just seeing that cache_match_memcg goes away
>>>> and my understanding of the function is that it should prevent from
>>>> different memcg's caches merging.
>>> Let me try to explain how I understand it.
>>>
>>> What is cache merging/aliasing? When we create a cache
>>> (kmem_cache_create()), we first try to find a compatible cache that
>>> already exists and can handle requests from the new cache. If it is, we
>>> do not create any new caches, instead we simply increment the old cache
>>> refcount and return it.
>>>
>>> What about memcgs? Currently, it operates in the same way, i.e. on memcg
>>> cache creation we also try to find a compatible cache of the same memcg
>>> first. But if there were such a cache, their parents would have been
>>> merged (i.e. it would be the same cache). That means we would not even
>>> get to this memcg cache creation, because it already exists. That's why
>>> the code handling memcg caches merging seems pointless to me.
>>>
>> IIRC, this may not always hold. Some of the properties are configurable via
>> sysfs, and it might be that you haven't merged two parent caches because their
>> properties differ, but would be fine merging the child caches.
>>
>> If all properties we check are compile-time parameters, then it should be 
>> okay.
>
> AFAIK, we decide if a cache should be merged only basing on its internal
> parameters, such as size, ctor, flags, align (see find_mergeable()), but
> they are the same for root and memcg caches.
>
> The only way to disable slub merging is via the "slub_nomerge" kernel
> parameter, so it is impossible to get a situation when parents can not
> be merged, while children can.
>
> The only point of concern may be so called boot caches
> (create_boot_cache()), which are forcefully not allowed to be merged by
> setting refcount = -1. There are actually only two of them kmem_cache
> and kmem_cache_node used for internal slub allocations. I guess it was
> done preliminary, and we should not merge them for memcgs neither.
>
> To sum it up, if a particular root cache is allowed to be merged, it was
> allowed to be merged since its creation and all its children caches are
> also allowed to be merged. If merging was not allowed for a root cache
> when it was created, we should not merge its children caches.
>
> Thanks.

Fair Enough.
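
For readers not familiar with the merge test being discussed, a rough sketch of
the creation-time parameters it compares (simplified toy code; the real logic
lives in find_mergeable() in mm/slub.c and checks more than this):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* The point being argued above: every input to the merge decision is fixed at
 * cache-creation time and inherited unchanged by a memcg child, so "parents
 * mergeable" and "children mergeable" cannot diverge. */

struct toy_cache {
        const char *name;
        size_t object_size;
        size_t align;
        unsigned long flags;
        void (*ctor)(void *);
        int refcount;            /* -1 marks a boot cache: never merged */
};

static bool slub_nomerge;        /* the only runtime switch, a boot parameter */

static bool toy_mergeable(const struct toy_cache *a, const struct toy_cache *b)
{
        if (slub_nomerge || a->refcount < 0 || b->refcount < 0)
                return false;
        return a->object_size == b->object_size &&
               a->align == b->align &&
               a->flags == b->flags &&
               a->ctor == b->ctor;
}

int main(void)
{
        struct toy_cache parent_a = { "cache-a", 192, 8, 0, NULL, 1 };
        struct toy_cache parent_b = { "cache-b", 192, 8, 0, NULL, 1 };

        /* If the parents merge, their memcg children would have merged into
         * the very same cache as well. */
        printf("parents mergeable: %s\n",
               toy_mergeable(&parent_a, &parent_b) ? "yes" : "no");
        return 0;
}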


-- 
E Mare, Libertas


Re: [PATCH 3/8] memcg, slab: never try to merge memcg caches

2014-02-04 Thread Glauber Costa
On Tue, Feb 4, 2014 at 7:27 PM, Vladimir Davydov  wrote:
> On 02/04/2014 07:11 PM, Michal Hocko wrote:
>> On Tue 04-02-14 18:59:23, Vladimir Davydov wrote:
>>> On 02/04/2014 06:52 PM, Michal Hocko wrote:
 On Sun 02-02-14 20:33:48, Vladimir Davydov wrote:
> Suppose we are creating memcg cache A that could be merged with cache B
> of the same memcg. Since any memcg cache has the same parameters as its
> parent cache, parent caches PA and PB of memcg caches A and B must be
> mergeable too. That means PA was merged with PB on creation or vice
> versa, i.e. PA = PB. From that it follows that A = B, and we couldn't
> even try to create cache B, because it already exists - a contradiction.
 I cannot tell I understand the above but I am totally not sure about the
 statement below.

> So let's remove unused code responsible for merging memcg caches.
 How come the code was unused? find_mergeable called cache_match_memcg...
>>> Oh, sorry for misleading comment. I mean the code handling merging of
>>> per-memcg caches is useless, AFAIU: if we find an alias for a per-memcg
>>> cache on kmem_cache_create_memcg(), the parent of the found alias must
>>> be the same as the parent_cache passed to kmem_cache_create_memcg(), but
>>> if it were so, we would never proceed to the memcg cache creation,
>>> because the cache we want to create already exists.
>> I am still not sure I understand this correctly. So the outcome of this
>> patch is that compatible caches of different memcgs can be merged
>> together? Sorry if this is a stupid question but I am not that familiar
>> with this area much I am just seeing that cache_match_memcg goes away
>> and my understanding of the function is that it should prevent from
>> different memcg's caches merging.
>
> Let me try to explain how I understand it.
>
> What is cache merging/aliasing? When we create a cache
> (kmem_cache_create()), we first try to find a compatible cache that
> already exists and can handle requests from the new cache. If it is, we
> do not create any new caches, instead we simply increment the old cache
> refcount and return it.
>
> What about memcgs? Currently, it operates in the same way, i.e. on memcg
> cache creation we also try to find a compatible cache of the same memcg
> first. But if there were such a cache, their parents would have been
> merged (i.e. it would be the same cache). That means we would not even
> get to this memcg cache creation, because it already exists. That's why
> the code handling memcg caches merging seems pointless to me.
>

IIRC, this may not always hold. Some of the properties are configurable via
sysfs, and it might be that you haven't merged two parent caches because their
properties differ, but would be fine merging the child caches.

If all properties we check are compile-time parameters, then it should be okay.


-- 
E Mare, Libertas



Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs

2014-02-03 Thread Glauber Costa
On Mon, Feb 3, 2014 at 10:57 AM, Vladimir Davydov <vdavy...@parallels.com> wrote:
> On 02/03/2014 10:21 AM, David Rientjes wrote:
>> On Sun, 2 Feb 2014, Vladimir Davydov wrote:
>>
>>> Per-memcg kmem caches are named as follows:
>>>
>>>   <global-cache-name>(<cgroup-kmem-id>:<cgroup-name>)
>>>
>>> where <cgroup-kmem-id> is the unique id of the memcg the cache belongs
>>> to, <cgroup-name> is the relative name of the memcg on the cgroup fs.
>>> Cache names are exposed to userspace for debugging purposes (e.g. via
>>> sysfs in case of slub or via dmesg).
>>>
>>> Using relative names makes it impossible in general (in case the cgroup
>>> hierarchy is not flat) to find out which memcg a particular cache
>>> belongs to, because <cgroup-kmem-id> is not known to the user. Since
>>> using absolute cgroup names would be an overkill, let's fix this by
>>> exporting the id of kmem-active memcg via cgroup fs file
>>> "memory.kmem.id".
>>>
>> Hmm, I'm not sure exporting additional information is the best way to do
>> it only for this purpose.  I do understand the problem in naming
>> collisions if the hierarchy isn't flat and we typically work around that
>> by ensuring child memcgs still have a unique memcg.  This isn't only a
>> problem in slab cache naming, me also avoid printing the entire absolute
>> names for things like the oom killer.
>
> AFAIU, cgroup identifiers dumped on oom (cgroup paths, currently) and
> memcg slab cache names serve for different purposes. The point is oom is
> a perfectly normal situation for the kernel, and info dumped to dmesg is
> for admin to find out the cause of the problem (a greedy user or
> cgroup). On the other hand, slab cache names are dumped to dmesg only on
> extraordinary situations - like bugs in slab implementation, or double
> free, or detected memory leaks - where we usually do not need the name
> of the memcg that triggered the problem, because the bug is likely to be
> in the kernel subsys using the cache. Plus, the names are exported to
> sysfs in case of slub, again for debugging purposes, AFAIK. So IMO the
> use cases for oom vs slab names are completely different - information
> vs debugging - and I want to export kmem.id only for the ability of
> debugging kmemcg and slab subsystems.
>

Then maybe it is better to wrap it into some kind of CONFIG_DEBUG wrap.
We already have other files like that.

>> So it would be nice to have
>> consensus on how people are supposed to identify memcgs with a hierarchy:
>> either by exporting information like the id like you do here (but leave
>> the oom killer still problematic) or by insisting people name their memcgs
>> with unique names if they care to differentiate them.
>
> Anyway, I agree with you that this needs a consensus, because this is a
> functional change.
>
> Thanks.



-- 
E Mare, Libertas



Re: [PATCH v14 16/18] vmpressure: in-kernel notifications

2013-12-20 Thread Glauber Costa
On Fri, Dec 20, 2013 at 8:53 PM, Luiz Capitulino  wrote:
> On Fri, 20 Dec 2013 20:46:05 +0400
> Glauber Costa  wrote:
>
>> On Fri, Dec 20, 2013 at 8:44 PM, Luiz Capitulino  
>> wrote:
>> > On Fri, 20 Dec 2013 10:03:32 -0500
>> > Luiz Capitulino  wrote:
>> >
>> >> > The answer for all of your questions above can be summarized by noting
>> >> > that for the lack of other users (at the time), this patch does the 
>> >> > bare minimum
>> >> > for memcg needs. I agree, for instance, that it would be good to pass 
>> >> > the level
>> >> > but since memcg won't do anything with thta, I didn't pass it.
>> >> >
>> >> > That should be extended if you need to.
>> >>
>> >> That works for me. That is, including this minimal version first and
>> >> extending it when we get in-tree users.
>> >
>> > Btw, there's something I was thinking just right now. If/when we
>> > convert shrink functions to use this API, they will come to depend
>> > on CONFIG_MEMCG=y. IOW, they won't work if CONFIG_MEMCG=n.
>> >
>> > Is this acceptable (this is an honest question)? Because today, they
>> > do work when CONFIG_MEMCG=n. Should those shrink functions use the
>> > shrinker API as a fallback?
>>
>> If you have a non-memcg user, that should obviously be available for
>> CONFIG_MEMCG=n
>
> OK, which means we'll have to change it, right? Because, if I'm not
> missing something, today vmpressure does depend on CONFIG_MEMCG=y.

You mean the main vmpressure mechanism?
Sorry, this was out of my mental cachelines. Yes, vmpressure depends on MEMCG,
because the pressure interface is memcg-specific (global == root memcg).

You might want to change that so you can reuse the mechanism and let only the
user interface depend on memcg.


-- 
E Mare, Libertas


Re: [PATCH v14 16/18] vmpressure: in-kernel notifications

2013-12-20 Thread Glauber Costa
On Fri, Dec 20, 2013 at 8:44 PM, Luiz Capitulino  wrote:
> On Fri, 20 Dec 2013 10:03:32 -0500
> Luiz Capitulino  wrote:
>
>> > The answer for all of your questions above can be summarized by noting
>> > that for the lack of other users (at the time), this patch does the bare 
>> > minimum
>> > for memcg needs. I agree, for instance, that it would be good to pass the 
>> > level
>> > but since memcg won't do anything with thta, I didn't pass it.
>> >
>> > That should be extended if you need to.
>>
>> That works for me. That is, including this minimal version first and
>> extending it when we get in-tree users.
>
> Btw, there's something I was thinking just right now. If/when we
> convert shrink functions to use this API, they will come to depend
> on CONFIG_MEMCG=y. IOW, they won't work if CONFIG_MEMCG=n.
>
> Is this acceptable (this is an honest question)? Because today, they
> do work when CONFIG_MEMCG=n. Should those shrink functions use the
> shrinker API as a fallback?

If you have a non-memcg user, that should obviously be available for
CONFIG_MEMCG=n


Re: [PATCH v14 16/18] vmpressure: in-kernel notifications

2013-12-20 Thread Glauber Costa
One correction:

>>  int vmpressure_register_kernel_event(struct cgroup_subsys_state *css,
>> - void (*fn)(void))
>> +void (*fn)(void *data, int level), void 
>> *data)
>>  {
>> -   struct vmpressure *vmpr = css_to_vmpressure(css);
>> +   struct vmpressure *vmpr;
>> struct vmpressure_event *ev;
>>
>> +   vmpr = css ? css_to_vmpressure(css) : memcg_to_vmpressure(NULL);
>> +

This looks like it could be improved. Better not to have that memcg-specific
thing here.

Other than that it makes sense.
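
To illustrate why the extra data pointer matters, here is a minimal user-space
sketch of the corrected callback shape (toy names throughout, not the kernel
API; it only models what vmpressure_event() would do for a kernel-side event):

#include <stdio.h>

enum vmpressure_levels { VMPRESSURE_LOW, VMPRESSURE_MEDIUM, VMPRESSURE_CRITICAL };

/* A kernel-side event carries a callback plus an opaque data pointer, so the
 * callback can reach its own state without globals. */
struct toy_kernel_event {
        void (*fn)(void *data, int level);
        void *data;
};

struct toy_cache {
        const char *name;
        long nr_objects;
};

static void toy_shrink(void *data, int level)
{
        struct toy_cache *cache = data;

        /* Drop half the cache on medium pressure, everything on critical. */
        if (level >= VMPRESSURE_CRITICAL)
                cache->nr_objects = 0;
        else if (level >= VMPRESSURE_MEDIUM)
                cache->nr_objects /= 2;
        printf("%s: %ld objects left\n", cache->name, cache->nr_objects);
}

int main(void)
{
        struct toy_cache cache = { "demo-cache", 1024 };
        struct toy_kernel_event ev = { .fn = toy_shrink, .data = &cache };

        ev.fn(ev.data, VMPRESSURE_MEDIUM);
        ev.fn(ev.data, VMPRESSURE_CRITICAL);
        return 0;
}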


Re: [PATCH v14 16/18] vmpressure: in-kernel notifications

2013-12-20 Thread Glauber Costa
> I have the exact problem described above for a project I'm working on
> and this solution seems to solve it well.
>
> However, I had a few issues while trying to use this interface. I'll
> comment on them below, but please take this more as advice seeking
> than patch review.
>
>> This patch extends that to also support in-kernel users. Events that
>> should be generated for in-kernel consumption will be marked as such,
>> and for those, we will call a registered function instead of triggering
>> an eventfd notification.
>>
>> Please note that due to my lack of understanding of each shrinker user,
>> I will stay away from converting the actual users, you are all welcome
>> to do so.
>>
>> Signed-off-by: Glauber Costa 
>> Signed-off-by: Vladimir Davydov 
>> Acked-by: Anton Vorontsov 
>> Acked-by: Pekka Enberg 
>> Reviewed-by: Greg Thelen 
>> Cc: Dave Chinner 
>> Cc: John Stultz 
>> Cc: Andrew Morton 
>> Cc: Joonsoo Kim 
>> Cc: Michal Hocko 
>> Cc: Kamezawa Hiroyuki 
>> Cc: Johannes Weiner 
>> ---
>>  include/linux/vmpressure.h |5 +
>>  mm/vmpressure.c|   53 
>> +---
>>  2 files changed, 55 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
>> index 3f3788d..9102e53 100644
>> --- a/include/linux/vmpressure.h
>> +++ b/include/linux/vmpressure.h
>> @@ -19,6 +19,9 @@ struct vmpressure {
>>   /* Have to grab the lock on events traversal or modifications. */
>>   struct mutex events_lock;
>>
>> + /* False if only kernel users want to be notified, true otherwise. */
>> + bool notify_userspace;
>> +
>>   struct work_struct work;
>>  };
>>
>> @@ -38,6 +41,8 @@ extern int vmpressure_register_event(struct 
>> cgroup_subsys_state *css,
>>struct cftype *cft,
>>struct eventfd_ctx *eventfd,
>>const char *args);
>> +extern int vmpressure_register_kernel_event(struct cgroup_subsys_state *css,
>> + void (*fn)(void));
>>  extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
>>   struct cftype *cft,
>>   struct eventfd_ctx *eventfd);
>> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
>> index e0f6283..730e7c1 100644
>> --- a/mm/vmpressure.c
>> +++ b/mm/vmpressure.c
>> @@ -130,8 +130,12 @@ static enum vmpressure_levels 
>> vmpressure_calc_level(unsigned long scanned,
>>  }
>>
>>  struct vmpressure_event {
>> - struct eventfd_ctx *efd;
>> + union {
>> + struct eventfd_ctx *efd;
>> + void (*fn)(void);
>
> How does the callback access its private data?
>
>> + };
>>   enum vmpressure_levels level;
>> + bool kernel_event;
>>   struct list_head node;
>>  };
>>
>> @@ -147,12 +151,15 @@ static bool vmpressure_event(struct vmpressure *vmpr,
>>   mutex_lock(&vmpr->events_lock);
>>
>>   list_for_each_entry(ev, >events, node) {
>> - if (level >= ev->level) {
>> + if (ev->kernel_event) {
>> + ev->fn();
>
> I think it would be interesting to pass 'level' to the callback (I'll
> probably use it myself), but we could wait for a in-tree user before
> adding it.
>
>> + } else if (vmpr->notify_userspace && level >= ev->level) {
>>   eventfd_signal(ev->efd, 1);
>>   signalled = true;
>>   }
>>   }
>>
>> + vmpr->notify_userspace = false;
>>   mutex_unlock(&vmpr->events_lock);
>>
>>   return signalled;
>> @@ -222,7 +229,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>>* we account it too.
>>*/
>>   if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
>> - return;
>> + goto schedule;
>>
>>   /*
>>* If we got here with no pages scanned, then that is an indicator
>> @@ -239,8 +246,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>>   vmpr->scanned += scanned;
>>   vmpr->reclaimed += reclaimed;
>>   scanned = vmpr->scanned;
>> + /*
>> +  * If we didn't reach this point, only kernel events will be triggered.
>> +  * It is the job of the worker thread to clean this up once the
>> +  * notifications are all delivered.
>> +  */
>> + vmpr->notify_userspace = true;
>>   spin_unlock(&vmpr->sr_lock);
>>
>> +schedule:
>>   if (scanned < vmpressure_win)
>>   return;
>>   schedule_work(&vmpr->work);
>> @@ -324,6 +338,39 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
>>  }
>>
>>  /**
>> + * vmpressure_register_kernel_event() - Register kernel-side notification
>> + * @css: css that is interested in vmpressure notifications
>> + * @fn:  function to be called when pressure happens
>> + *
>> + * This function register in-kernel users interested in receiving notifications


Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex

2013-12-19 Thread Glauber Costa
On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov <vdavy...@parallels.com> wrote:
> On 12/18/2013 09:41 PM, Michal Hocko wrote:
>> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>>> The memcg_params::memcg_caches array can be updated concurrently from
>>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>>> of these functions take the slab_mutex during their operation, the
>>> latter checks if memcg's cache has already been allocated w/o taking the
>>> mutex. This can result in a race as described below.
>>>
>>> Assume two threads schedule kmem_cache creation works for the same
>>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>>> works successfully creates it. Another work should fail then, but if it
>>> interleaves with memcg_update_cache_size() as follows, it does not:
>> I am not sure I understand the race. memcg_update_cache_size is called
>> when we start accounting a new memcg or a child is created and it
>> inherits accounting from the parent. memcg_create_kmem_cache is called
>> when a new cache is first allocated from, right?
>
> memcg_update_cache_size() is called when kmem accounting is activated
> for a memcg, no matter how.
>
> memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
> It's OK to have a bunch of such methods trying to create the same memcg
> cache concurrently, but only one of them should succeed.
>
>> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
>> it is running from the workqueue context so it shouldn't clash with other
>> locks.
>
> Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
> have always been wondering why, because it could simplify flow paths
> significantly (e.g. update_cache_sizes() -> update_all_caches() ->
> update_cache_size() - from memcontrol.c to slab_common.c and back again
> just to take the mutex).
>

Because that is a layering violation and exposes implementation details of the
slab allocator to the outside world. I agree this would make things a lot
simpler, but please check with Christoph whether this is acceptable before
going forward.
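
As a plain illustration of the locking pattern being debated (check and create
under one mutex), a small pthreads sketch, with toy names standing in for
slab_mutex and the memcg cache pointer (not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Both the "does the memcg cache already exist?" check and the creation happen
 * under the same lock, so two workers racing to create the same cache can
 * never both succeed. The racy variant did the existence check before taking
 * the lock. */

static pthread_mutex_t toy_slab_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *toy_memcg_cache;
static int toy_creations;

static void *toy_get_or_create(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&toy_slab_mutex);
        if (!toy_memcg_cache) {                 /* check ... */
                toy_memcg_cache = malloc(64);   /* ... and init under the lock */
                toy_creations++;
        }
        pthread_mutex_unlock(&toy_slab_mutex);
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, toy_get_or_create, NULL);
        pthread_create(&t2, NULL, toy_get_or_create, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);

        printf("caches created: %d\n", toy_creations);   /* always 1 */
        free(toy_memcg_cache);
        return 0;
}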



Re: [PATCH 1/2] memcg: fix memcg_size() calculation

2013-12-16 Thread Glauber Costa
On Mon, Dec 16, 2013 at 8:47 PM, Michal Hocko  wrote:
> On Sat 14-12-13 12:15:33, Vladimir Davydov wrote:
>> The mem_cgroup structure contains nr_node_ids pointers to
>> mem_cgroup_per_node objects, not the objects themselves.
>
> Ouch! This is 2k per node which is wasted. What a shame I haven't
> noticed this back then when reviewing 45cf7ebd5a033 (memcg: reduce the
> size of struct memcg 244-fold)
>
IIRC, they weren't pointers back then. I think they were embedded in the
structure, and I left them embedded. My mind may be tricking me, but I think
I recall that Johannes changed them to pointers later on. No?

In any case, this is correct.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13 11/16] mm: list_lru: add per-memcg lists

2013-12-12 Thread Glauber Costa
> OK, as far as I can tell, this is introducing a per-node, per-memcg
> LRU lists. Is that correct?
>
> If so, then that is not what Glauber and I originally intended for
> memcg LRUs. per-node LRUs are expensive in terms of memory and cross
> multiplying them by the number of memcgs in a system was not a good
> use of memory.
>
> According to Glauber, most memcgs are small and typically confined
> to a single node or two by external means and therefore don't need the
> scalability numa aware LRUs provide. Hence the idea was that the
> memcg LRUs would just be a single LRU list, just like a non-numa
> aware list_lru instantiation. IOWs, this is the structure that we
> had decided on as the best compromise between memory usage,
> complexity and memcg awareness:
>
Sorry for jumping late into this particular e-mail.

I just wanted to point out that the reason I adopted such a matrix in my
design was that it actually uses less memory this way. My reasoning for this
was explained in the original patch posting that contained that
implementation.

This is because whenever an object would go on a memcg list, it *would not*
go on the global list. Therefore, to keep information about nodes for global
reclaim, you have to put them in node-lists.

memcg reclaim, however, would reclaim regardless of node information.

In global reclaim, the memcg lists would be scanned obeying the node
structure in the lists.

Because that has a fixed cost, it ends up using less memory than having a
second list pointer in the objects, which is something that scales with the
number of objects. Not to mention, that cost would be incurred even with
memcg not being in use, which is something that we would like to avoid.
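
For illustration only, the shape of the two options. The types below are
simplified stand-ins, not the actual list_lru code, and assume the usual
kernel headers.

struct lru_node_list {
        spinlock_t              lock;
        struct list_head        list;
        long                    nr_items;
};

/*
 * Option A (the matrix): one set of per-node lists per memcg.  The cost is
 * fixed at roughly nr_memcgs * nr_nodes * sizeof(struct lru_node_list), an
 * object sits on exactly one list (its memcg's, on its node), and global
 * reclaim can still walk the lists node by node.
 */
struct list_lru {
        struct lru_node_list    node[MAX_NUMNODES];     /* global objects */
        struct lru_node_list    **memcg_lrus;           /* [memcg_id][nid] */
};

/*
 * Option B (single list per memcg): to keep node information for global
 * reclaim, every object would need a second list_head so it can also sit on
 * a per-node global list.  That cost scales with the number of objects and
 * is paid even when memcg is not in use.
 */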
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Race in memcg kmem?

2013-12-10 Thread Glauber Costa
On Tue, Dec 10, 2013 at 5:59 PM, Vladimir Davydov
 wrote:
> Hi,
>
> Looking through the per-memcg kmem_cache initialization code, I have a
> bad feeling that it is prone to a race. Before getting to fixing it, I'd
> like to ensure this race is not only in my imagination. Here it goes.
>
> We keep per-memcg kmem caches in the memcg_params field of each root
> cache. The memcg_params array is grown dynamically by
> memcg_update_cache_size(). I guess that if this function is executed
> concurrently with memcg_create_kmem_cache() we can get a race resulting
> in a memory leak.
>
Ok, let's see.

> -- memcg_create_kmem_cache(memcg, cachep) --
> creates a new kmem_cache corresponding to a memcg and assigns it to the
> root cache; called in the background - it is OK to have several such
> functions trying to create a cache for the same memcg concurrently, but
> only one of them should succeed.

Yes.

> @cachep is the root cache
> @memcg is the memcg we want to create a cache for.
>
> The function:
>
> A1) assures there is no cache corresponding to the memcg (if it is we
> have nothing to do):
> idx = memcg_cache_id(memcg);
> if (cachep->memcg_params[idx])
> goto out;
>
> A2) creates and assigns a new cache:
> new_cachep = kmem_cache_dup(memcg, cachep);
Please note this cannot proceed in parallel with memcg_update_cache_size(),
because they both take the slab mutex.

> // init new_cachep
> cachep->memcg_params->memcg_caches[idx] = new_cachep;
>
>
> -- memcg_update_cache_size(s, num_groups) --
> grows s->memcg_params to accomodate data for num_groups memcg's
> @s is the root cache whose memcg_params we want to grow
> @num_groups is the new number of kmem-active cgroups (defines the new
> size of memcg_params array).
>
> The function:
>
> B1) allocates and assigns a new cache:
> cur_params = s->memcg_params;
> s->memcg_params = kzalloc(size, GFP_KERNEL);
>
> B2) copies per-memcg cache ptrs from the old memcg_params array to the
> new one:
> for (i = 0; i < memcg_limited_groups_array_size; i++) {
> if (!cur_params->memcg_caches[i])
> continue;
> s->memcg_params->memcg_caches[i] =
> cur_params->memcg_caches[i];
> }
>
> B3) frees the old array:
> kfree(cur_params);
>
>
> Since these two functions do not share any mutexes, we can get the

They do share a mutex, the slab mutex.

> following race:
>
> Assume, by the time Cpu0 gets to memcg_create_kmem_cache(), the memcg
> cache has already been created by another thread, so this function
> should do nothing.
>
> Cpu0Cpu1
> 
> B1
> A1  we haven't initialized memcg_params yet so Cpu0 will
> proceed to A2 to alloc and assign a new cache
> A2
> B2  Cpu1 rewrites the memcg cache ptr set by Cpu0 at A2
> - a memory leak?
> B3
>
> I'd like to add that even if I'm right about the race, this is rather
> not critical, because memcg_update_cache_sizes() is called very rarely.
>
Every race is critical.

So, I am a bit lost by your description. Get back to me if I got anything
wrong, but I think the point that you're missing is that all heavy slab
operations take the slab_mutex underneath, and that includes cache creation
and update.

>
> BTW, it seems to me that the way we update memcg_params in
> memcg_update_cache_size() make cache_from_memcg_idx() prone to
> use-after-free:
>
>> static inline struct kmem_cache *
>> cache_from_memcg_idx(struct kmem_cache *s, int idx)
>> {
>> if (!s->memcg_params)
>> return NULL;
>> return s->memcg_params->memcg_caches[idx];
>> }
>
> This is equivalent to
>
> 1) struct memcg_cache_params *params = s->memcg_params;
> 2) return params->memcg_caches[idx];
>
> If memcg_update_cache_size() is executed between steps 1 and 2 on
> another CPU, at step 2 we will dereference memcg_params that has already
> been freed. This is very unlikely, but still possible. Perhaps, we
> should free old memcg params only after a sync_rcu()?
>

You seem to be right on this one. Indeed, if my mind does not betray me,
that is how I freed the LRUs (with synchronize_rcu()).
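
Something along these lines, as a sketch of the suggestion (illustrative,
not a tested patch; memcg_params_publish() is a made-up helper name):

static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
        struct memcg_cache_params *params;
        struct kmem_cache *cachep = NULL;

        rcu_read_lock();
        params = rcu_dereference(s->memcg_params);
        if (params)
                cachep = params->memcg_caches[idx];
        rcu_read_unlock();
        /* the cache itself is kept alive by other means (slab_mutex/refs) */
        return cachep;
}

/* on the update side, publish the new array and free the old one only
 * after a grace period: */
static void memcg_params_publish(struct kmem_cache *s,
                                 struct memcg_cache_params *new_params,
                                 struct memcg_cache_params *old_params)
{
        rcu_assign_pointer(s->memcg_params, new_params);
        synchronize_rcu();
        kfree(old_params);
}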
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13 13/16] vmscan: take at least one pass with shrinkers

2013-12-10 Thread Glauber Costa
On Tue, Dec 10, 2013 at 3:50 PM, Vladimir Davydov
 wrote:
> On 12/10/2013 08:18 AM, Dave Chinner wrote:
>> On Mon, Dec 09, 2013 at 12:05:54PM +0400, Vladimir Davydov wrote:
>>> From: Glauber Costa 
>>>
>>> In very low free kernel memory situations, it may be the case that we
>>> have less objects to free than our initial batch size. If this is the
>>> case, it is better to shrink those, and open space for the new workload
>>> then to keep them and fail the new allocations.
>>>
>>> In particular, we are concerned with the direct reclaim case for memcg.
>>> Although this same technique can be applied to other situations just as
>>> well, we will start conservative and apply it for that case, which is
>>> the one that matters the most.
>> This should be at the start of the series.
>
> Since Glauber wanted to introduce this only for memcg-reclaim first,
> this can't be at the start of the series, but I'll move it to go
> immediately after per-memcg shrinking core in the next iteration.
>
> Thanks.

So, the reason for that being memcg-only is that the reclaim for small
objects triggered a bunch of XFS regressions (I am sure the regressions are
general, but I've tested them using ZFS).

In theory they shouldn't, so we can try to make it global again, so long as
it comes together with benchmarks demonstrating that it is a safe change.

I am not sure the filesystem people would benefit from that directly,
though. So it may not be worth the hassle...
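
For reference, the idea in the quoted patch boils down to something like
this. It is a simplified sketch of a shrink loop, not the actual code; the
shrink_slab_one() wrapper is a made-up name.

static unsigned long shrink_slab_one(struct shrinker *shrinker,
                                     struct shrink_control *sc,
                                     unsigned long total_scan,
                                     unsigned long batch_size,
                                     bool memcg_reclaim)
{
        unsigned long freed = 0;
        bool first_pass = true;

        /* normally we only scan in batch_size chunks; for memcg direct
         * reclaim we take at least one pass even if fewer objects exist */
        while (total_scan >= batch_size ||
               (memcg_reclaim && first_pass && total_scan > 0)) {
                unsigned long nr = min(total_scan, batch_size);
                unsigned long ret;

                sc->nr_to_scan = nr;
                ret = shrinker->scan_objects(shrinker, sc);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;
                total_scan -= nr;
                first_pass = false;
        }
        return freed;
}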
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13 14/16] vmpressure: in-kernel notifications

2013-12-10 Thread Glauber Costa
On Mon, Dec 9, 2013 at 12:05 PM, Vladimir Davydov
 wrote:
> From: Glauber Costa 
>
> During the past weeks, it became clear to us that the shrinker interface

It has been more than a few weeks by now =)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13 05/16] vmscan: move call to shrink_slab() to shrink_zones()

2013-12-10 Thread Glauber Costa
On Mon, Dec 9, 2013 at 12:05 PM, Vladimir Davydov
 wrote:
> This reduces the indentation level of do_try_to_free_pages() and removes
> extra loop over all eligible zones counting the number of on-LRU pages.
>

Looks correct to me.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13 00/16] kmemcg shrinkers

2013-12-10 Thread Glauber Costa
> Please note that in contrast to previous versions this patch-set implements
> slab shrinking only when we hit the user memory limit so that kmem allocations
> will still fail if we are below the user memory limit, but close to the kmem
> limit. This is, because the implementation of kmem-only reclaim was rather
> incomplete - we had to fail GFP_NOFS allocations since everything we could
> reclaim was only FS data. I will try to improve this and send in a separate
> patch-set, but currently it is only worthwhile setting the kmem limit to be
> greater than the user mem limit just to enable per-memcg slab accounting and
> reclaim.

That is unfortunate, but it makes sense as a first step.



-- 
E Mare, Libertas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v13 04/16] memcg: move memcg_caches_array_size() function

2013-12-10 Thread Glauber Costa
On Mon, Dec 9, 2013 at 12:05 PM, Vladimir Davydov
 wrote:
> I need to move this up a bit, and I am doing in a separate patch just to
> reduce churn in the patch that needs it.
>
> Signed-off-by: Vladimir Davydov 
Reviewed-by: Glauber Costa 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] memcg: remove KMEM_ACCOUNTED_ACTIVATED

2013-12-04 Thread Glauber Costa
>>
>> Could you do something clever with just one flag? Probably yes. But I
>> doubt it would
>> be that much cleaner, this is just the way that patching sites work.
>
> Thank you for spending your time to listen to me.
>
Don't worry! I thank you for carrying this forward.

> Let me try to explain what is bothering me.
>
> We have two state bits for each memcg, 'active' and 'activated'. There
> are two scenarios where the bits can be modified:
>
> 1) The kmem limit is set on a memcg for the first time -
> memcg_update_kmem_limit(). Here we call memcg_update_cache_sizes(),
> which sets the 'activated' bit on success, then update static branching,
> then set the 'active' bit. All three actions are done atomically in
> respect to other tasks setting the limit due to the set_limit_mutex.
> After both bits are set, they never get cleared for the memcg.
>
So far so good. But again, note how you yourself describe it: the actions
are done atomically *in respect to other tasks setting the limit*.

But there are also tasks that are running their course naturally and just
allocating memory. For those, some call sites will be on, some will be off.
We need to make sure that *none* of them uses the patched site until *all*
of them are patched. This has nothing to do with updates, this is all about
the readers.
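
On the reader side this amounts to something like the check below. It is a
sketch: the wrapper name is made up, and the two helpers are written from
memory of the code being discussed here.

static inline bool memcg_should_account_kmem(struct mem_cgroup *memcg)
{
        /* static branch: this call site may or may not be patched yet */
        if (!memcg_kmem_enabled())
                return false;
        /* set only after static_key_slow_inc() has patched *all* sites */
        if (!memcg_kmem_is_active(memcg))
                return false;
        return true;
}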

> 2) When a subgroup of a kmem-active cgroup is created -
> memcg_propagate_kmem(). Here we copy kmem_account_flags from the parent,
> then increase static branching refcounter, then call
> memcg_update_cache_sizes() for the new memcg, which may clear the
> 'activated' bit on failure. After successful execution, the state bits
> never get cleared for the new memcg.
>
> In scenario 2 there is no need bothering about the flags setting order,
> because we don't have any tasks in the cgroup yet - the tasks can be
> moved in only after css_online finishes when we have both of the bits
> set and the static branching enabled. Actually, we already do not bother
> about it, because we have both bits set before the cgroup is fully
> initialized (memcg_update_cache_sizes() is called).
>
Yes, after the first cgroup is set none of that matters. But it is just
easier and less error prone to follow the same path every time. As I have
said, if you can come up with a more clever way to deal with the problem
above that doesn't involve the double flag - and you can prove it works - I
am definitely fine with it. But this is subtle code, and in the past -
Michal can attest to this - we've changed this, being sure it would work,
just to see it explode in our faces.

So although I am willing to review every patch for correctness on that
front (I never said I liked the 2-flags scheme...), unless you have a bug or
a real problem with it, I would advise against changing it if it's just to
make it more readable.

But again, don't take me too seriously on this. If you and Michal think you
can come up with something better, I'm all for it.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] memcg: remove KMEM_ACCOUNTED_ACTIVATED

2013-12-03 Thread Glauber Costa
>>> Hi, Glauber

Hi.

>
> In memcg_update_kmem_limit() we do the whole process of limit
> initialization under a mutex so the situation we need protection from in
> tcp_update_limit() is impossible. BTW once set, the 'activated' flag is
> never cleared and never checked alone, only along with the 'active'
> flag, that's why I doubt we need it at all.

The updates are protected by a mutex, but the readers are not, and should
not be. So we can still be patching the readers, and the double flag was
initially used so we can make sure that both flags are only set after the
static branches are in.

Note that one of the flags is set inside memcg_update_cache_sizes(). After
that, we call static_key_slow_inc(). At this point, someone whose code is
patched in could start accounting, but it shouldn't - because not all sites
are patched in.

So after the update is done, we set the other flag, and now everybody will
start going through.

Could you do something clever with just one flag? Probably yes. But I doubt
it would be that much cleaner; this is just the way that patching sites
work.
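
Spelled out, the ordering on the write side looks roughly like this. It is
a condensed sketch based on the workflow quoted in this thread, not the
exact code, and the _sketch function name is made up.

static int memcg_activate_kmem_sketch(struct mem_cgroup *memcg, u64 val)
{
        int ret;

        mutex_lock(&set_limit_mutex);
        ret = memcg_update_cache_sizes(memcg);  /* sets the 'activated' bit */
        if (!ret) {
                ret = res_counter_set_limit(&memcg->kmem, val);
                if (!ret) {
                        /* patch every call site... */
                        static_key_slow_inc(&memcg_kmem_enabled_key);
                        /* ...and only then let readers start accounting */
                        memcg_kmem_set_active(memcg);
                }
        }
        mutex_unlock(&set_limit_mutex);
        return ret;
}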

-- 
E Mare, Libertas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] memcg: remove KMEM_ACCOUNTED_ACTIVATED

2013-12-02 Thread Glauber Costa
On Mon, Dec 2, 2013 at 11:21 PM, Vladimir Davydov
 wrote:
> On 12/02/2013 10:26 PM, Glauber Costa wrote:
>>
>> On Mon, Dec 2, 2013 at 10:15 PM, Michal Hocko  wrote:
>>>
>>> [CCing Glauber - please do so in other posts for kmem related changes]
>>>
>>> On Mon 02-12-13 17:08:13, Vladimir Davydov wrote:
>>>>
>>>> The KMEM_ACCOUNTED_ACTIVATED was introduced by commit a8964b9b ("memcg:
>>>> use static branches when code not in use") in order to guarantee that
>>>> static_key_slow_inc(_kmem_enabled_key) will be called only once
>>>> for each memory cgroup when its kmem limit is set. The point is that at
>>>> that time the memcg_update_kmem_limit() function's workflow looked like
>>>> this:
>>>>
>>>>    bool must_inc_static_branch = false;
>>>>
>>>>    cgroup_lock();
>>>>    mutex_lock(&set_limit_mutex);
>>>>    if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
>>>>            /* The kmem limit is set for the first time */
>>>>            ret = res_counter_set_limit(&memcg->kmem, val);
>>>>
>>>>            memcg_kmem_set_activated(memcg);
>>>>            must_inc_static_branch = true;
>>>>    } else
>>>>            ret = res_counter_set_limit(&memcg->kmem, val);
>>>>    mutex_unlock(&set_limit_mutex);
>>>>    cgroup_unlock();
>>>>
>>>>    if (must_inc_static_branch) {
>>>>            /* We can't do this under cgroup_lock */
>>>>            static_key_slow_inc(&memcg_kmem_enabled_key);
>>>>            memcg_kmem_set_active(memcg);
>>>>    }
>>>>
>>>> Today, we don't use cgroup_lock in memcg_update_kmem_limit(), and
>>>> static_key_slow_inc() is called under the set_limit_mutex, but the
>>>> leftover from the above-mentioned commit is still here. Let's remove it.
>>>
>>> OK, so I have looked there again and 692e89abd154b (memcg: increment
>>> static branch right after limit set) which went in after cgroup_mutex
>>> has been removed. It came along with the following comment.
>>>  /*
>>>   * setting the active bit after the inc will guarantee no one
>>>   * starts accounting before all call sites are patched
>>>   */
>>>
>>> This suggests that the flag is needed after all because we have
>>> to be sure that _all_ the places have to be patched. AFAIU
>>> memcg_kmem_newpage_charge might see the static key already patched so
>>> it would do a charge but memcg_kmem_commit_charge would still see it
>>> unpatched and so the charge won't be committed.
>>>
>>> Or am I missing something?
>>
>> You are correct. This flag is there due to the way we are using static
>> branches.
>> The patching of one call site is atomic, but the patching of all of
>> them are not.
>> Therefore we need to use a two-flag scheme to guarantee that in the first
>> time
>> we turn the static branches on, there will be a clear point after
>> which we're going
>> to start accounting.
>
>
> Hi, Glauber
>
> Sorry, but I don't understand why we need two flags. Isn't checking the flag
> set after all call sites have been patched (I mean KMEM_ACCOUNTED_ACTIVE)
> not enough?

Take a look at net/ipv4/tcp_memcontrol.c. There are comprehensive comments
there for a mechanism that basically achieves the same thing. The idea is
that one flag is used at all times and means "it is enabled". The second
flag is a one-time-only flag to indicate that the patching process is
complete. With one flag it seems to work, but it is racy.

-- 
E Mare, Libertas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] memcg: remove KMEM_ACCOUNTED_ACTIVATED

2013-12-02 Thread Glauber Costa
On Mon, Dec 2, 2013 at 10:51 PM, Michal Hocko  wrote:
> On Mon 02-12-13 22:26:48, Glauber Costa wrote:
>> On Mon, Dec 2, 2013 at 10:15 PM, Michal Hocko  wrote:
>> > [CCing Glauber - please do so in other posts for kmem related changes]
>> >
>> > On Mon 02-12-13 17:08:13, Vladimir Davydov wrote:
>> >> The KMEM_ACCOUNTED_ACTIVATED was introduced by commit a8964b9b ("memcg:
>> >> use static branches when code not in use") in order to guarantee that
>> >> static_key_slow_inc(_kmem_enabled_key) will be called only once
>> >> for each memory cgroup when its kmem limit is set. The point is that at
>> >> that time the memcg_update_kmem_limit() function's workflow looked like
>> >> this:
>> >>
>> >>   bool must_inc_static_branch = false;
>> >>
>> >>   cgroup_lock();
>> >>   mutex_lock(&set_limit_mutex);
>> >>   if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
>> >>           /* The kmem limit is set for the first time */
>> >>           ret = res_counter_set_limit(&memcg->kmem, val);
>> >>
>> >>           memcg_kmem_set_activated(memcg);
>> >>           must_inc_static_branch = true;
>> >>   } else
>> >>           ret = res_counter_set_limit(&memcg->kmem, val);
>> >>   mutex_unlock(&set_limit_mutex);
>> >>   cgroup_unlock();
>> >>
>> >>   if (must_inc_static_branch) {
>> >>           /* We can't do this under cgroup_lock */
>> >>           static_key_slow_inc(&memcg_kmem_enabled_key);
>> >>           memcg_kmem_set_active(memcg);
>> >>   }
>> >>
>> >> Today, we don't use cgroup_lock in memcg_update_kmem_limit(), and
>> >> static_key_slow_inc() is called under the set_limit_mutex, but the
>> >> leftover from the above-mentioned commit is still here. Let's remove it.
>> >
>> > OK, so I have looked there again and 692e89abd154b (memcg: increment
>> > static branch right after limit set) which went in after cgroup_mutex
>> > has been removed. It came along with the following comment.
>> > /*
>> >  * setting the active bit after the inc will guarantee no one
>> >  * starts accounting before all call sites are patched
>> >  */
>> >
>> > This suggests that the flag is needed after all because we have
>> > to be sure that _all_ the places have to be patched. AFAIU
>> > memcg_kmem_newpage_charge might see the static key already patched so
>> > it would do a charge but memcg_kmem_commit_charge would still see it
>> > unpatched and so the charge won't be committed.
>> >
>> > Or am I missing something?
>>
>> You are correct. This flag is there due to the way we are using static 
>> branches.
>> The patching of one call site is atomic, but the patching of all of
>> them are not.
>> Therefore we need to use a two-flag scheme to guarantee that in the first 
>> time
>> we turn the static branches on, there will be a clear point after
>> which we're going
>> to start accounting.
>
> So http://lkml.org/lkml/2013/11/27/314 is correct then, right?

It looks correct.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] memcg: remove KMEM_ACCOUNTED_ACTIVATED

2013-12-02 Thread Glauber Costa
On Mon, Dec 2, 2013 at 10:15 PM, Michal Hocko  wrote:
> [CCing Glauber - please do so in other posts for kmem related changes]
>
> On Mon 02-12-13 17:08:13, Vladimir Davydov wrote:
>> The KMEM_ACCOUNTED_ACTIVATED was introduced by commit a8964b9b ("memcg:
>> use static branches when code not in use") in order to guarantee that
>> static_key_slow_inc(_kmem_enabled_key) will be called only once
>> for each memory cgroup when its kmem limit is set. The point is that at
>> that time the memcg_update_kmem_limit() function's workflow looked like
>> this:
>>
>>   bool must_inc_static_branch = false;
>>
>>   cgroup_lock();
>>   mutex_lock(&set_limit_mutex);
>>   if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
>>           /* The kmem limit is set for the first time */
>>           ret = res_counter_set_limit(&memcg->kmem, val);
>>
>>           memcg_kmem_set_activated(memcg);
>>           must_inc_static_branch = true;
>>   } else
>>           ret = res_counter_set_limit(&memcg->kmem, val);
>>   mutex_unlock(&set_limit_mutex);
>>   cgroup_unlock();
>>
>>   if (must_inc_static_branch) {
>>           /* We can't do this under cgroup_lock */
>>           static_key_slow_inc(&memcg_kmem_enabled_key);
>>           memcg_kmem_set_active(memcg);
>>   }
>>
>> Today, we don't use cgroup_lock in memcg_update_kmem_limit(), and
>> static_key_slow_inc() is called under the set_limit_mutex, but the
>> leftover from the above-mentioned commit is still here. Let's remove it.
>
> OK, so I have looked there again and 692e89abd154b (memcg: increment
> static branch right after limit set) which went in after cgroup_mutex
> has been removed. It came along with the following comment.
> /*
>  * setting the active bit after the inc will guarantee no one
>  * starts accounting before all call sites are patched
>  */
>
> This suggests that the flag is needed after all because we have
> to be sure that _all_ the places have to be patched. AFAIU
> memcg_kmem_newpage_charge might see the static key already patched so
> it would do a charge but memcg_kmem_commit_charge would still see it
> unpatched and so the charge won't be committed.
>
> Or am I missing something?

You are correct. This flag is there due to the way we are using static
branches. The patching of one call site is atomic, but the patching of all
of them is not. Therefore we need to use a two-flag scheme to guarantee
that, the first time we turn the static branches on, there will be a clear
point after which we're going to start accounting.





-- 
E Mare, Libertas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-07-09 Thread Glauber Costa
On Tue, Jul 09, 2013 at 10:50:32AM -0700, Andrew Morton wrote:
> On Tue, 9 Jul 2013 21:32:51 +0400 Glauber Costa  wrote:
> 
> > > $ dmesg | grep "blocked for more than"
> > > [276962.652076] INFO: task xfs-data/sda9:930 blocked for more than 480 
> > > seconds.
> > > [276962.653097] INFO: task kworker/2:2:17823 blocked for more than 480 
> > > seconds.
> > > [276962.653940] INFO: task ld:14442 blocked for more than 480 seconds.
> > > [276962.654297] INFO: task ld:14962 blocked for more than 480 seconds.
> > > [277442.652123] INFO: task xfs-data/sda9:930 blocked for more than 480 
> > > seconds.
> > > [277442.653153] INFO: task kworker/2:2:17823 blocked for more than 480 
> > > seconds.
> > > [277442.653997] INFO: task ld:14442 blocked for more than 480 seconds.
> > > [277442.654353] INFO: task ld:14962 blocked for more than 480 seconds.
> > > [277922.652069] INFO: task xfs-data/sda9:930 blocked for more than 480 
> > > seconds.
> > > [277922.653089] INFO: task kworker/2:2:17823 blocked for more than 480 
> > > seconds.
> > > 
> > 
> > You seem to have switched to XFS. Dave posted a patch two days ago fixing 
> > some
> > missing conversions in the XFS side. AFAIK, Andrew hasn't yet picked the 
> > patch.
> 
> I can't find that patch.  Please resend?
> 
> There's also "list_lru: fix broken LRU_RETRY behaviour", which I
> assume we need?

Yes, we can either apply or stash that one - up to you.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-07-09 Thread Glauber Costa
On Mon, Jul 08, 2013 at 02:53:52PM +0200, Michal Hocko wrote:
> On Thu 04-07-13 18:36:43, Michal Hocko wrote:
> > On Wed 03-07-13 21:24:03, Dave Chinner wrote:
> > > On Tue, Jul 02, 2013 at 02:44:27PM +0200, Michal Hocko wrote:
> > > > On Tue 02-07-13 22:19:47, Dave Chinner wrote:
> > > > [...]
> > > > > Ok, so it's been leaked from a dispose list somehow. Thanks for the
> > > > > info, Michal, it's time to go look at the code
> > > > 
> > > > OK, just in case we will need it, I am keeping the machine in this state
> > > > for now. So we still can play with crash and check all the juicy
> > > > internals.
> > > 
> > > My current suspect is the LRU_RETRY code. I don't think what it is
> > > doing is at all valid - list_for_each_safe() is not safe if you drop
> > > the lock that protects the list. i.e. there is nothing that protects
> > > the stored next pointer from being removed from the list by someone
> > > else. Hence what I think is occurring is this:
> > > 
> > > 
> > > thread 1  thread 2
> > > lock(lru)
> > > list_for_each_safe(lru)   lock(lru)
> > >   isolate ..
> > > lock(i_lock)
> > > has buffers
> > >   __iget
> > >   unlock(i_lock)
> > >   unlock(lru)
> > >   .   (gets lru lock)
> > >   list_for_each_safe(lru)
> > > walks all the inodes
> > > finds inode being isolated by other thread
> > > isolate
> > >   i_count > 0
> > > list_del_init(i_lru)
> > > return LRU_REMOVED;
> > >  moves to next inode, inode that
> > >  other thread has stored as next
> > >  isolate
> > >i_state |= I_FREEING
> > >list_move(dispose_list)
> > >return LRU_REMOVED
> > >
> > >unlock(lru)
> > >   lock(lru)
> > >   return LRU_RETRY;
> > >   if (!first_pass)
> > > 
> > >   --nr_to_scan
> > >   (loop again using next, which has already been removed from the
> > >   LRU by the other thread!)
> > >   isolate
> > > lock(i_lock)
> > > if (i_state & ~I_REFERENCED)
> > >   list_del_init(i_lru)< inode is on dispose list!
> > >   < inode is now isolated, with I_FREEING set
> > >   return LRU_REMOVED;
> > > 
> > > That fits the corpse left on your machine, Michal. One thread has
> > > moved the inode to a dispose list, the other thread thinks it is
> > > still on the LRU and should be removed, and removes it.
> > > 
> > > This also explains the lru item count going negative - the same item
> > > is being removed from the lru twice. So it seems like all the
> > > problems you've been seeing are caused by this one problem
> > > 
> > > Patch below that should fix this.
> > 
> > Good news! The test was running since morning and it didn't hang nor
> > crashed. So this really looks like the right fix. It will run also
> > during weekend to be 100% sure. But I guess it is safe to say
> 
> Hmm, it seems I was too optimistic or we have yet another issue here (I
> guess the later is more probable).
> 
> The weekend testing got stuck as well. 
> 
> The dmesg shows there were some hung tasks:
> [275284.264312] start.sh (11025): dropped kernel caches: 3
> [276962.652076] INFO: task xfs-data/sda9:930 blocked for more than 480 
> seconds.
> [276962.652087] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables 
> this message.
> [276962.652093] xfs-data/sda9   D 88001ffb9cc8 0   930  2 
> 0x
> [276962.652102]  88003794d198 0046 8800325f4480 
> 
> [276962.652113]  88003794c010 00012dc0 00012dc0 
> 00012dc0
> [276962.652121]  00012dc0 88003794dfd8 88003794dfd8 
> 00012dc0
> [276962.652128] Call Trace:
> [276962.652151]  [] ? __blk_run_queue+0x32/0x40
> [276962.652160]  [] ? queue_unplugged+0x78/0xb0
> [276962.652171]  [] schedule+0x24/0x70
> [276962.652178]  [] io_schedule+0x9c/0xf0
> [276962.652187]  [] sleep_on_page+0x9/0x10
> [276962.652194]  [] __wait_on_bit+0x5a/0x90
> [276962.652200]  [] ? __lock_page+0x70/0x70
> [276962.652206]  [] wait_on_page_bit+0x6f/0x80
> [276962.652215]  [] ? autoremove_wake_function+0x40/0x40
> [276962.652224]  [] ? page_evictable+0x11/0x50
> [276962.652231]  [] shrink_page_list+0x503/0x790
> [276962.652239]  [] shrink_inactive_list+0x1bb/0x570
> [276962.652246]  [] ? shrink_active_list+0x29f/0x340
> [276962.652254]  [] shrink_lruvec+0xf9/0x330
> [276962.652262]  [] mem_cgroup_shrink_node_zone+0xda/0x140
> [276962.652274]  [] ? mem_cgroup_reclaimable+0x108/0x150
> [276962.652282]  [] 

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-07-09 Thread Glauber Costa
On Mon, Jul 08, 2013 at 02:04:19PM -0700, Andrew Morton wrote:
> On Mon, 8 Jul 2013 14:53:52 +0200 Michal Hocko  wrote:
> 
> > > Good news! The test was running since morning and it didn't hang nor
> > > crashed. So this really looks like the right fix. It will run also
> > > during weekend to be 100% sure. But I guess it is safe to say
> > 
> > Hmm, it seems I was too optimistic or we have yet another issue here (I
> > guess the later is more probable).
> > 
> > The weekend testing got stuck as well. 
> > 
> > The dmesg shows there were some hung tasks:
> 
> That looks like the classic "we lost an IO completion" trace.
> 
> I think it would be prudent to defer these patches into 3.12.
Agree.

Will they stay in -mm, or do I have to resend?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-07-03 Thread Glauber Costa
On Wed, Jul 03, 2013 at 09:24:03PM +1000, Dave Chinner wrote:
> On Tue, Jul 02, 2013 at 02:44:27PM +0200, Michal Hocko wrote:
> > On Tue 02-07-13 22:19:47, Dave Chinner wrote:
> > [...]
> > > Ok, so it's been leaked from a dispose list somehow. Thanks for the
> > > info, Michal, it's time to go look at the code
> > 
> > OK, just in case we will need it, I am keeping the machine in this state
> > for now. So we still can play with crash and check all the juicy
> > internals.
> 
> My current suspect is the LRU_RETRY code. I don't think what it is
> doing is at all valid - list_for_each_safe() is not safe if you drop
> the lock that protects the list. i.e. there is nothing that protects
> the stored next pointer from being removed from the list by someone
> else. Hence what I think is occurring is this:
> 
> 
> thread 1  thread 2
> lock(lru)
> list_for_each_safe(lru)   lock(lru)
>   isolate ..
> lock(i_lock)
> has buffers
>   __iget
>   unlock(i_lock)
>   unlock(lru)
>   .   (gets lru lock)
>   list_for_each_safe(lru)
> walks all the inodes
> finds inode being isolated by other thread
> isolate
>   i_count > 0
> list_del_init(i_lru)
> return LRU_REMOVED;
>  moves to next inode, inode that
>  other thread has stored as next
>  isolate
>i_state |= I_FREEING
>list_move(dispose_list)
>return LRU_REMOVED
>
>unlock(lru)
>   lock(lru)
>   return LRU_RETRY;
>   if (!first_pass)
> 
>   --nr_to_scan
>   (loop again using next, which has already been removed from the
>   LRU by the other thread!)
>   isolate
> lock(i_lock)
> if (i_state & ~I_REFERENCED)
>   list_del_init(i_lru)< inode is on dispose list!
>   < inode is now isolated, with I_FREEING set
>   return LRU_REMOVED;
> 
> That fits the corpse left on your machine, Michal. One thread has
> moved the inode to a dispose list, the other thread thinks it is
> still on the LRU and should be removed, and removes it.
> 
> This also explains the lru item count going negative - the same item
> is being removed from the lru twice. So it seems like all the
> problems you've been seeing are caused by this one problem
> 
> Patch below that should fix this.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com
> 
> list_lru: fix broken LRU_RETRY behaviour
> 
> From: Dave Chinner 
> 
> The LRU_RETRY code assumes that the list traversal status is preserved
> after we have dropped and regained the list lock. Unfortunately, this is not
> a valid assumption, and that can lead to racing traversals isolating
> objects that the other traversal expects to be the next item on the
> list.
> 
> This is causing problems with the inode cache shrinker isolation,
> with races resulting in an inode on a dispose list being "isolated"
> because a racing traversal still thinks it is on the LRU. The inode
> is then never reclaimed and that causes hangs if a subsequent lookup
> on that inode occurs.
> 
> Fix it by always restarting the list walk on a LRU_RETRY return from
> the isolate callback. Avoid the possibility of livelocks the current
> code was trying to avoid by always decrementing the nr_to_walk
> counter on retries so that even if we keep hitting the same item on
> the list we'll eventually stop trying to walk and exit out of the
> situation causing the problem.
> 
> Reported-by: Michal Hocko 
> Signed-off-by: Dave Chinner 
> ---
>  mm/list_lru.c |   29 -
>  1 file changed, 12 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index dc71659..7246791 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -71,19 +71,19 @@ list_lru_walk_node(struct list_lru *lru, int nid, 
> list_lru_walk_cb isolate,
>   struct list_lru_node *nlru = &lru->node[nid];
>   struct list_head *item, *n;
>   unsigned long isolated = 0;
> - /*
> -  * If we don't keep state of at which pass we are, we can loop at
> -  * LRU_RETRY, since we have no guarantees that the caller will be able
> -  * to do something other than retry on the next pass. We handle this by
> -  * allowing at most one retry per object. This should not be altered
> -  * by any condition other than LRU_RETRY.
> -  */
> - bool first_pass = true;
>  
>   spin_lock(&nlru->lock);
>  restart:
>   list_for_each_safe(item, n, &nlru->list) {
>   enum lru_status ret;
> +
> 
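
For context, the restart logic described in the changelog amounts to roughly
the following inside list_lru_walk_node(). This is a sketch of the idea only
(it assumes nr_to_walk is passed by reference, as elsewhere in this series),
not the exact patch:

	spin_lock(&nlru->lock);
restart:
	list_for_each_safe(item, n, &nlru->list) {
		enum lru_status ret;

		/* consume the walk budget on every item, retries included,
		 * so that a permanently-retrying object cannot livelock us */
		if (*nr_to_walk == 0)
			break;
		(*nr_to_walk)--;

		ret = isolate(item, &nlru->lock, cb_arg);
		switch (ret) {
		case LRU_REMOVED:
			nlru->nr_items--;
			isolated++;
			break;
		case LRU_ROTATE:
			list_move_tail(item, &nlru->list);
			break;
		case LRU_SKIP:
			break;
		case LRU_RETRY:
			/* the lock was dropped, so the cached next pointer
			 * may be stale: restart from the head of the list */
			goto restart;
		}
	}
	spin_unlock(&nlru->lock);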

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-28 Thread Glauber Costa
On Fri, Jun 28, 2013 at 10:39:43AM +0200, Michal Hocko wrote:
> I have just triggered this one.
> 
> [37955.364062] RIP: 0010:[]  [] 
> list_lru_walk_node+0xab/0x140
> [37955.364062] RSP: :8800374af7b8  EFLAGS: 00010286
> [37955.364062] RAX: 0106 RBX: 88002ead7838 RCX: 
> 8800374af830
Note ebx

> [37955.364062] RDX: 0107 RSI: 88001d250dc0 RDI: 
> 88002ead77d0
> [37955.364062] RBP: 8800374af818 R08:  R09: 
> 88001ffeafc0
> [37955.364062] R10: 0002 R11:  R12: 
> 88001d250dc0
> [37955.364062] R13: 00a0 R14: 00572ead7838 R15: 
> 88001d250dc8
Note r14


> [37955.364062] Process as (pid: 3351, threadinfo 8800374ae000, task 
> 880036d665c0)
> [37955.364062] Stack:
> [37955.364062]  88001da3e700 8800374af830 8800374af838 
> 811846d0
> [37955.364062]   88001ce75c48 01ff8800374af838 
> 8800374af838
> [37955.364062]   88001ce75800 8800374afa08 
> 1014
> [37955.364062] Call Trace:
> [37955.364062]  [] ? insert_inode_locked+0x160/0x160
> [37955.364062]  [] prune_icache_sb+0x3c/0x60
> [37955.364062]  [] super_cache_scan+0x12e/0x1b0
> [37955.364062]  [] shrink_slab_node+0x13a/0x250
> [37955.364062]  [] shrink_slab+0xab/0x120
> [37955.364062]  [] do_try_to_free_pages+0x264/0x360
> [37955.364062]  [] try_to_free_pages+0x130/0x180
> [37955.364062]  [] ? __switch_to+0x1b4/0x550
> [37955.364062]  [] __alloc_pages_slowpath+0x39e/0x790
> [37955.364062]  [] __alloc_pages_nodemask+0x1fa/0x210
> [37955.364062]  [] alloc_pages_vma+0xa0/0x120
> [37955.364062]  [] do_anonymous_page+0x16b/0x350
> [37955.364062]  [] handle_pte_fault+0x235/0x240
> [37955.364062]  [] ? set_next_entity+0xb0/0xd0
> [37955.364062]  [] handle_mm_fault+0x2ef/0x400
> [37955.364062]  [] __do_page_fault+0x237/0x4f0
> [37955.364062]  [] ? fsnotify_access+0x68/0x80
> [37955.364062]  [] ? vfs_read+0xd8/0x130
> [37955.364062]  [] do_page_fault+0x9/0x1088002ead7838
> [37955.364062]  [] page_fault+0x28/0x30
> [37955.364062] Code: 44 24 18 0f 84 87 00 00 00 49 83 7c 24 18 00 78 7b 49 83 
> c5 01 48 8b 4d a8 48 8b 11 48 8d 42 ff 48 85 d2 48 89 01 74 78 4d 39 f7 <49> 
> 8b 06 4c 89 f3 74 6d 49 89 c6 eb a6 0f 1f 84 00 00 00 00 00 
> [37955.364062] RIP  [] list_lru_walk_node+0xab/0x140
> 
> 81127e0e:   48 8b 55 b0 mov-0x50(%rbp),%rdx
> 81127e12:   4c 89 e6mov%r12,%rsi
> 81127e15:   48 89 dfmov%rbx,%rdi
> 81127e18:   ff 55 b8callq  *-0x48(%rbp)   
> # isolate(item, &nlru->lock, cb_arg)
> 81127e1b:   83 f8 01cmp$0x1,%eax
> 81127e1e:   74 78   je 81127e98 <list_lru_walk_node+0xe8>
> 81127e20:   73 4e   jae81127e70 <list_lru_walk_node+0xc0>
> [...]
One interesting thing I have noted here is that r14 is basically the lower
half of rbx, with the upper part borked.

Because we are talking about a single word, this does not seem to be the usual
update-half-of-a-double-word-without-locking issue.

From your excerpt, it is not totally clear what r14 is. But by looking at rdi,
which is 0x88002ead77d0 and is very probably nlru->lock given the calling
convention, that would indicate that this is nlru->list in case you have
spinlock debugging enabled.

So yes, someone destroyed our next pointer, and amazingly only half of it.

Still, the only time we ever release this lock is when isolate returns 
LRU_RETRY. Maybe the
way we restart is wrong? (although I can't see how)

An iput() happens outside the lock in that case, but it seems safe: if that
ends up manipulating the lru it will do so through our accessors.

I will have to think a bit more... Any other strange thing happening before it ?
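
For reference, the generic hazard with dropping the list lock inside a
list_for_each_safe() walk looks like this (illustrative sketch, not code from
the tree):

	spin_lock(&lock);
	list_for_each_safe(pos, n, &head) {	/* n = pos->next is cached here */
		spin_unlock(&lock);		/* another CPU may list_del(n) now */
		slow_operation(pos);
		spin_lock(&lock);
		/* the macro advances to n, which may already be off the list */
	}
	spin_unlock(&lock);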
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-23 Thread Glauber Costa
On Sun, Jun 23, 2013 at 03:51:29PM +0400, Glauber Costa wrote:
> On Fri, Jun 21, 2013 at 11:00:21AM +0200, Michal Hocko wrote:
> > On Thu 20-06-13 17:12:01, Michal Hocko wrote:
> > > I am bisecting it again. It is quite tedious, though, because good case
> > > is hard to be sure about.
> > 
> > OK, so now I converged to 2d4fc052 (inode: convert inode lru list to 
> > generic lru
> > list code.) in my tree and I have double checked it matches what is in
> > the linux-next. This doesn't help much to pin point the issue I am
> > afraid :/
> > 
> Can you revert this patch (easiest way ATM is to rewind your tree to a point
> right before it) and apply the following patch?
> 
> As Dave has mentioned, it is very likely that this bug was already there, we
> were just not ever checking imbalances. The attached patch would tell us at
> least if the imbalance was there before. If this is the case, I would suggest
> turning the BUG condition into a WARN_ON_ONCE since we would be officially
> not introducing any regression. It is no less of a bug, though, and we should
> keep looking for it.
> 
> The main change from before / after the patch is that we are now keeping 
> things
> per node. One possibility of having this BUGing would be to have an inode to 
> be
> inserted into one node-lru and removed from another. I cannot see how it could
> happen, because kernel pages are stable in memory and are not moved from node
> to node. We could still have some sort of weird bug in the node calculation
> function. In any case, would it be possible for you to artificially restrict
> your setup to a single node ? Although I have no idea how to do that, we seem
> to have no parameter to disable numa. Maybe booting with less memory, enough 
> to
> fit a single node?
> 
The patch:
diff --git a/fs/inode.c b/fs/inode.c
index 1ddaa2e..0b5c3fa 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -427,6 +427,7 @@ static void inode_lru_list_del(struct inode *inode)
if (!list_empty(&inode->i_lru)) {
list_del_init(&inode->i_lru);
inode->i_sb->s_nr_inodes_unused--;
+   BUG_ON(sb->s_nr_inodes_unused < 0);
this_cpu_dec(nr_unused);
}
spin_unlock(&inode->i_sb->s_inode_lru_lock);
@@ -739,6 +740,7 @@ long prune_icache_sb(struct super_block *sb, unsigned long 
nr_to_scan)
list_del_init(&inode->i_lru);
spin_unlock(&inode->i_lock);
sb->s_nr_inodes_unused--;
+   BUG_ON(sb->s_nr_inodes_unused < 0);
this_cpu_dec(nr_unused);
continue;
}
@@ -777,6 +779,7 @@ long prune_icache_sb(struct super_block *sb, unsigned long 
nr_to_scan)
 
list_move(&inode->i_lru, &freeable);
sb->s_nr_inodes_unused--;
+   BUG_ON(sb->s_nr_inodes_unused < 0);
this_cpu_dec(nr_unused);
freed++;
}
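
If the imbalance does turn out to predate the series, the same check can be
downgraded as suggested above. Roughly (a sketch with a made-up helper, not a
posted patch):

static inline void sb_dec_nr_inodes_unused(struct super_block *sb)
{
	sb->s_nr_inodes_unused--;
	/* warn once instead of killing the box if the counter goes negative */
	WARN_ON_ONCE(sb->s_nr_inodes_unused < 0);
}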


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-23 Thread Glauber Costa
On Fri, Jun 21, 2013 at 11:00:21AM +0200, Michal Hocko wrote:
> On Thu 20-06-13 17:12:01, Michal Hocko wrote:
> > I am bisecting it again. It is quite tedious, though, because good case
> > is hard to be sure about.
> 
> OK, so now I converged to 2d4fc052 (inode: convert inode lru list to generic 
> lru
> list code.) in my tree and I have double checked it matches what is in
> the linux-next. This doesn't help much to pin point the issue I am
> afraid :/
> 
Can you revert this patch (easiest way ATM is to rewind your tree to a point
right before it) and apply the following patch?

As Dave has mentioned, it is very likely that this bug was already there, we
were just not ever checking imbalances. The attached patch would tell us at
least if the imbalance was there before. If this is the case, I would suggest
turning the BUG condition into a WARN_ON_ONCE since we would be officially
not introducing any regression. It is no less of a bug, though, and we should
keep looking for it.

The main change from before / after the patch is that we are now keeping things
per node. One possibility of having this BUGing would be to have an inode to be
inserted into one node-lru and removed from another. I cannot see how it could
happen, because kernel pages are stable in memory and are not moved from node
to node. We could still have some sort of weird bug in the node calculation
function. In any case, would it be possible for you to artificially restrict
your setup to a single node? Although I have no idea how to do that, we seem
to have no parameter to disable numa. Maybe booting with less memory, enough to
fit a single node?
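
For reference, the per-node placement in question boils down to deriving the
node from the object's backing page, roughly like this (a sketch of the idea,
not the exact list_lru code):

static int lru_node_of(void *item)
{
	/* kernel objects do not migrate between nodes, so this is stable;
	 * add and del must both compute the same nid for a given object */
	return page_to_nid(virt_to_page(item));
}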

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-20 Thread Glauber Costa
On Wed, Jun 19, 2013 at 04:28:01PM +0200, Michal Hocko wrote:
> On Wed 19-06-13 09:13:46, Michal Hocko wrote:
> > On Tue 18-06-13 10:26:24, Glauber Costa wrote:
> > [...]
> > > Michal, would you mind testing the following patch?
> > >
> > > diff --git a/fs/inode.c b/fs/inode.c
> > > index 00b804e..48eafa6 100644
> > > --- a/fs/inode.c
> > > +++ b/fs/inode.c
> > > @@ -419,6 +419,8 @@ void inode_add_lru(struct inode *inode)
> > >  
> > >  static void inode_lru_list_del(struct inode *inode)
> > >  {
> > > + if (inode->i_state & I_FREEING)
> > > + return;
> > >  
> > >   if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
> > >   this_cpu_dec(nr_unused);
> > > @@ -609,8 +611,8 @@ void evict_inodes(struct super_block *sb)
> > >   continue;
> > >   }
> > >  
> > > - inode->i_state |= I_FREEING;
> > >   inode_lru_list_del(inode);
> > > + inode->i_state |= I_FREEING;
> > >   spin_unlock(&inode->i_lock);
> > >   list_add(&inode->i_lru, &dispose);
> > >   }
> > > @@ -653,8 +655,8 @@ int invalidate_inodes(struct super_block *sb, bool 
> > > kill_dirty)
> > >   continue;
> > >   }
> > >  
> > > - inode->i_state |= I_FREEING;
> > >   inode_lru_list_del(inode);
> > > + inode->i_state |= I_FREEING;
> > >   spin_unlock(&inode->i_lock);
> > >   list_add(&inode->i_lru, &dispose);
> > >   }
> > > @@ -1381,9 +1383,8 @@ static void iput_final(struct inode *inode)
> > >   inode->i_state &= ~I_WILL_FREE;
> > >   }
> > >  
> > > + inode_lru_list_del(inode);
> > >   inode->i_state |= I_FREEING;
> > > - if (!list_empty(&inode->i_lru))
> > > - inode_lru_list_del(inode);
> > >   spin_unlock(&inode->i_lock);
> > >  
> > >   evict(inode);
> > 
> > No luck. I have this on top of inode_lru_isolate one but still can see
> 
> And I was lucky enough to hit another BUG_ON with this kernel (the above
> patch and inode_lru_isolate-fix):
> [84091.219056] [ cut here ]
> [84091.220015] kernel BUG at mm/list_lru.c:42!
> [84091.220015] invalid opcode:  [#1] SMP 
> [84091.220015] Modules linked in: edd nfsv3 nfs_acl nfs fscache lockd sunrpc 
> af_packet bridge stp llc cpufreq_conservative cpufreq_userspace 
> cpufreq_powersave fuse loop dm_mod powernow_k8 tg3 kvm_amd kvm ptp e1000 
> pps_core shpchp edac_core i2c_amd756 amd_rng pci_hotplug k8temp sg 
> i2c_amd8111 edac_mce_amd serio_raw sr_mod pcspkr cdrom button ohci_hcd 
> ehci_hcd usbcore usb_common processor thermal_sys scsi_dh_emc scsi_dh_rdac 
> scsi_dh_hp_sw scsi_dh ata_generic sata_sil pata_amd
> [84091.220015] CPU 1 
> [84091.220015] Pid: 32545, comm: rm Not tainted 3.9.0mmotmdebugging1+ #1472 
> AMD A8440/WARTHOG
> [84091.220015] RIP: 0010:[]  [] 
> list_lru_del+0xcf/0xe0
> [84091.220015] RSP: 0018:88001de85df8  EFLAGS: 00010286
> [84091.220015] RAX:  RBX: 88001e1ce2c0 RCX: 
> 0002
> [84091.220015] RDX: 88001e1ce2c8 RSI: 8800087f4220 RDI: 
> 88001e1ce2c0
> [84091.220015] RBP: 88001de85e18 R08:  R09: 
> 
> [84091.220015] R10: 88001d539128 R11: 880018234882 R12: 
> 8800087f4220
> [84091.220015] R13: 88001c68bc40 R14:  R15: 
> 88001de85ea8
> [84091.220015] FS:  7f43adb30700() GS:88001f10() 
> knlGS:
> [84091.220015] CS:  0010 DS:  ES:  CR0: 8005003b
> [84091.220015] CR2: 01ffed30 CR3: 1e02e000 CR4: 
> 07e0
> [84091.220015] DR0:  DR1:  DR2: 
> 
> [84091.220015] DR3:  DR6: 0ff0 DR7: 
> 0400
> [84091.220015] Process rm (pid: 32545, threadinfo 88001de84000, task 
> 88001c22e5c0)
> [84091.220015] Stack:
> [84091.220015]  8800087f4130 8800087f41b8 88001c68b800 
> 
> [84091.220015]  88001de85e48 81184357 88001de85e48 
> 8800087f4130
> [84091.220015]  88001e005000 880014e4eb40 88001de85e68 
> 81184418
> [84091.220015] Call Trace:
> [84091.220015]  [] iput_final+0x117/0x190
> [84091.220015]  [] iput+0x48/0x60
> [84091.220015]  [] do_unlinkat+0x214/0x240
> [84091.220015]  [] sys_unlinkat+0x1d/0x40
> [84091.220015]  [] system_call_fastpath+0x16/0x1b
> [84091.2

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-20 Thread Glauber Costa
On Wed, Jun 19, 2013 at 04:28:01PM +0200, Michal Hocko wrote:
 On Wed 19-06-13 09:13:46, Michal Hocko wrote:
  On Tue 18-06-13 10:26:24, Glauber Costa wrote:
  [...]
   Michal, would you mind testing the following patch?
  
   diff --git a/fs/inode.c b/fs/inode.c
   index 00b804e..48eafa6 100644
   --- a/fs/inode.c
   +++ b/fs/inode.c
   @@ -419,6 +419,8 @@ void inode_add_lru(struct inode *inode)

static void inode_lru_list_del(struct inode *inode)
{
   + if (inode-i_state  I_FREEING)
   + return;

 if (list_lru_del(inode-i_sb-s_inode_lru, inode-i_lru))
 this_cpu_dec(nr_unused);
   @@ -609,8 +611,8 @@ void evict_inodes(struct super_block *sb)
 continue;
 }

   - inode-i_state |= I_FREEING;
 inode_lru_list_del(inode);
   + inode-i_state |= I_FREEING;
 spin_unlock(inode-i_lock);
 list_add(inode-i_lru, dispose);
 }
   @@ -653,8 +655,8 @@ int invalidate_inodes(struct super_block *sb, bool 
   kill_dirty)
 continue;
 }

   - inode-i_state |= I_FREEING;
 inode_lru_list_del(inode);
   + inode-i_state |= I_FREEING;
 spin_unlock(inode-i_lock);
 list_add(inode-i_lru, dispose);
 }
   @@ -1381,9 +1383,8 @@ static void iput_final(struct inode *inode)
 inode-i_state = ~I_WILL_FREE;
 }

   + inode_lru_list_del(inode);
 inode-i_state |= I_FREEING;
   - if (!list_empty(inode-i_lru))
   - inode_lru_list_del(inode);
 spin_unlock(inode-i_lock);

 evict(inode);
  
  No luck. I have this on top of inode_lru_isolate one but still can see
 
 And I was lucky enough to hit another BUG_ON with this kernel (the above
 patch and inode_lru_isolate-fix):
 [84091.219056] [ cut here ]
 [84091.220015] kernel BUG at mm/list_lru.c:42!
 [84091.220015] invalid opcode:  [#1] SMP 
 [84091.220015] Modules linked in: edd nfsv3 nfs_acl nfs fscache lockd sunrpc 
 af_packet bridge stp llc cpufreq_conservative cpufreq_userspace 
 cpufreq_powersave fuse loop dm_mod powernow_k8 tg3 kvm_amd kvm ptp e1000 
 pps_core shpchp edac_core i2c_amd756 amd_rng pci_hotplug k8temp sg 
 i2c_amd8111 edac_mce_amd serio_raw sr_mod pcspkr cdrom button ohci_hcd 
 ehci_hcd usbcore usb_common processor thermal_sys scsi_dh_emc scsi_dh_rdac 
 scsi_dh_hp_sw scsi_dh ata_generic sata_sil pata_amd
 [84091.220015] CPU 1 
 [84091.220015] Pid: 32545, comm: rm Not tainted 3.9.0mmotmdebugging1+ #1472 
 AMD A8440/WARTHOG
 [84091.220015] RIP: 0010:[81127fff]  [81127fff] 
 list_lru_del+0xcf/0xe0
 [84091.220015] RSP: 0018:88001de85df8  EFLAGS: 00010286
 [84091.220015] RAX:  RBX: 88001e1ce2c0 RCX: 
 0002
 [84091.220015] RDX: 88001e1ce2c8 RSI: 8800087f4220 RDI: 
 88001e1ce2c0
 [84091.220015] RBP: 88001de85e18 R08:  R09: 
 
 [84091.220015] R10: 88001d539128 R11: 880018234882 R12: 
 8800087f4220
 [84091.220015] R13: 88001c68bc40 R14:  R15: 
 88001de85ea8
 [84091.220015] FS:  7f43adb30700() GS:88001f10() 
 knlGS:
 [84091.220015] CS:  0010 DS:  ES:  CR0: 8005003b
 [84091.220015] CR2: 01ffed30 CR3: 1e02e000 CR4: 
 07e0
 [84091.220015] DR0:  DR1:  DR2: 
 
 [84091.220015] DR3:  DR6: 0ff0 DR7: 
 0400
 [84091.220015] Process rm (pid: 32545, threadinfo 88001de84000, task 
 88001c22e5c0)
 [84091.220015] Stack:
 [84091.220015]  8800087f4130 8800087f41b8 88001c68b800 
 
 [84091.220015]  88001de85e48 81184357 88001de85e48 
 8800087f4130
 [84091.220015]  88001e005000 880014e4eb40 88001de85e68 
 81184418
 [84091.220015] Call Trace:
 [84091.220015]  [81184357] iput_final+0x117/0x190
 [84091.220015]  [81184418] iput+0x48/0x60
 [84091.220015]  [8117a804] do_unlinkat+0x214/0x240
 [84091.220015]  [8117aa4d] sys_unlinkat+0x1d/0x40
 [84091.220015]  [81583129] system_call_fastpath+0x16/0x1b
 [84091.220015] Code: 5c 41 5d b8 01 00 00 00 41 5e c9 c3 49 8d 45 08 f0 45 0f 
 b3 75 08 eb db 0f 1f 40 00 66 83 03 01 5b 41 5c 41 5d 31 c0 41 5e c9 c3 0f 
 0b eb fe 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 ba 00 00 
 [84091.220015] RIP  [81127fff] list_lru_del+0xcf/0xe0
 [84091.220015]  RSP 88001de85df8
 [84091.470390] ---[ end trace e6915e8ee0f5f079 ]---
 
 Which is BUG_ON(nlru-nr_items  0) from iput_final path. So it seems
 that there is still a race there.

I am still looking at this - still can't reproduce, still don't know what is
going on.

Could you share with me your .config and your hardware info and dmesg? In
particular, I want to know how many nodes you have.
--
To unsubscribe from this list

Re: linux-next: manual merge of the akpm tree with the ext4 tree

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 02:59:17PM -0400, Theodore Ts'o wrote:
> On Wed, Jun 19, 2013 at 10:06:35AM -0700, Andrew Morton wrote:
> > 
> > Merge the ext4 change early, please.  The core shrinker changes aren't
> > 100% certain at this time - first they need to stop oopsing ;)
> 
> Ack, sounds like a plan.
> 
> Are they oopsing often enough that they are likely to interfere with
> running regression tests on linux-next?
> 
I have only received one report so far. It will really depend on how likely it
is for other people to hit it - whether or not other people hit it easily is also
good info...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 03:57:16PM +0200, Michal Hocko wrote:
> On Wed 19-06-13 11:35:27, Glauber Costa wrote:
> [...]
> > Sorry if you said that before Michal.
> > 
> > But given the backtrace, are you sure this is LRU-related?
> 
> No idea. I just know that my mm tree behaves correctly after the whole
> series has been reverted (58f6e0c8fb37e8e37d5ac17a61a53ac236c15047) and
> before the latest version of the patchset has been applied.
> 
> > You mentioned you bisected it but found nothing conclusive.
> 
> Yes, but I was interested in crashes and not hangs so I will try it
> again.
> 
> I really hope this is not just some stupidness in my tree.
> 
Okay. Just looking at the stack trace you provided me, it would be hard
to implicate us. But it is not totally unreasonable either, since we touch
things around this. I would right now assign more probability to a tricky
bug than to some misconfiguration on your side.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 11:35:27AM +0400, Glauber Costa wrote:
> On Wed, Jun 19, 2013 at 09:13:46AM +0200, Michal Hocko wrote:
> > On Tue 18-06-13 10:26:24, Glauber Costa wrote:
> > [...]
> > > Michal, would you mind testing the following patch?
> > >
> > > diff --git a/fs/inode.c b/fs/inode.c
> > > index 00b804e..48eafa6 100644
> > > --- a/fs/inode.c
> > > +++ b/fs/inode.c
> > > @@ -419,6 +419,8 @@ void inode_add_lru(struct inode *inode)
> > >  
> > >  static void inode_lru_list_del(struct inode *inode)
> > >  {
> > > + if (inode->i_state & I_FREEING)
> > > + return;
> > >  
> > >   if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
> > >   this_cpu_dec(nr_unused);
> > > @@ -609,8 +611,8 @@ void evict_inodes(struct super_block *sb)
> > >   continue;
> > >   }
> > >  
> > > - inode->i_state |= I_FREEING;
> > >   inode_lru_list_del(inode);
> > > + inode->i_state |= I_FREEING;
> > >   spin_unlock(&inode->i_lock);
> > >   list_add(&inode->i_lru, &dispose);
> > >   }
> > > @@ -653,8 +655,8 @@ int invalidate_inodes(struct super_block *sb, bool 
> > > kill_dirty)
> > >   continue;
> > >   }
> > >  
> > > - inode->i_state |= I_FREEING;
> > >   inode_lru_list_del(inode);
> > > + inode->i_state |= I_FREEING;
> > >   spin_unlock(&inode->i_lock);
> > >   list_add(&inode->i_lru, &dispose);
> > >   }
> > > @@ -1381,9 +1383,8 @@ static void iput_final(struct inode *inode)
> > >   inode->i_state &= ~I_WILL_FREE;
> > >   }
> > >  
> > > + inode_lru_list_del(inode);
> > >   inode->i_state |= I_FREEING;
> > > - if (!list_empty(&inode->i_lru))
> > > - inode_lru_list_del(inode);
> > >   spin_unlock(&inode->i_lock);
> > >  
> > >   evict(inode);
> > 
> > No luck. I have this on top of inode_lru_isolate one but still can see
> > hangs:
> > 911 [] __wait_on_freeing_inode+0x9e/0xc0
> > [] find_inode_fast+0xa1/0xc0
> > [] iget_locked+0x4f/0x180
> > [] ext4_iget+0x33/0x9f0
> > [] ext4_lookup+0xbc/0x160
> > [] lookup_real+0x20/0x60
> > [] __lookup_hash+0x34/0x40
> > [] path_lookupat+0x7a2/0x830
> > [] filename_lookup+0x33/0xd0
> > [] user_path_at_empty+0x7b/0xb0
> > [] user_path_at+0xc/0x10
> > [] vfs_fstatat+0x51/0xb0
> > [] vfs_stat+0x16/0x20
> > [] sys_newstat+0x1f/0x50
> > [] system_call_fastpath+0x16/0x1b
> > [] 0x
> > 21409 [] __wait_on_freeing_inode+0x9e/0xc0
> > [] find_inode_fast+0xa1/0xc0
> > [] iget_locked+0x4f/0x180
> > [] ext4_iget+0x33/0x9f0
> > [] ext4_lookup+0xbc/0x160
> > [] lookup_real+0x20/0x60
> > [] lookup_open+0x175/0x1d0
> > [] do_last+0x2de/0x780
> > [] path_openat+0xda/0x400
> > [] do_filp_open+0x43/0xa0
> > [] do_sys_open+0x160/0x1e0
> > [] sys_open+0x1c/0x20
> > [] system_call_fastpath+0x16/0x1b
> > [] 0x
> > 21745 [] path_lookupat+0x792/0x830
> > [] filename_lookup+0x33/0xd0
> > [] user_path_at_empty+0x7b/0xb0
> > [] user_path_at+0xc/0x10
> > [] vfs_fstatat+0x51/0xb0
> > [] vfs_stat+0x16/0x20
> > [] sys_newstat+0x1f/0x50
> > [] system_call_fastpath+0x16/0x1b
> > [] 0x
> > 22032 [] path_lookupat+0x792/0x830
> > [] filename_lookup+0x33/0xd0
> > [] user_path_at_empty+0x7b/0xb0
> > [] user_path_at+0xc/0x10
> > [] vfs_fstatat+0x51/0xb0
> > [] vfs_stat+0x16/0x20
> > [] sys_newstat+0x1f/0x50
> > [] system_call_fastpath+0x16/0x1b
> > [] 0x
> > 22621 [] __wait_on_freeing_inode+0x9e/0xc0
> > [] find_inode_fast+0xa1/0xc0
> > [] iget_locked+0x4f/0x180
> > [] ext4_iget+0x33/0x9f0
> > [] ext4_lookup+0xbc/0x160
> > [] lookup_real+0x20/0x60
> > [] lookup_open+0x175/0x1d0
> > [] do_last+0x2de/0x780
> > [] path_openat+0xda/0x400
> > [] do_filp_open+0x43/0xa0
> > [] do_sys_open+0x160/0x1e0
> > [] sys_open+0x1c/0x20
> > [] system_call_fastpath+0x16/0x1b
> > [] 0x
> > 22711 [] do_last+0x2c4/0x780
> > [] path_openat+0xda/0x400
> > [] do_filp_open+0x43/0xa0
> > [] do_sys_open+0x160/0x1e0
> > [] sys_open+0x1c/0x20
> > [] system_call_fastpath+0x16/0x1b
> > [] 0x
> > 22946 [] do_last+0x2c4/0x780
> > [] path_ope

Re: linux-next: manual merge of the akpm tree with the ext4 tree

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 05:27:21PM +1000, Stephen Rothwell wrote:
> Hi Andrew,
> 
> Today's linux-next merge of the akpm tree got a conflict in
> fs/ext4/extents_status.c between commit 6480bad916be ("ext4: improve
> extent cache shrink mechanism to avoid to burn CPU time") from the ext
> tree and commit 1f42d0934b4e ("fs: convert fs shrinkers to new scan/count
> API") from the akpm tree.
> 
> I fixed it up (I am not sure if the result makes complete sense - see
> below) and can carry the fix as necessary (no action is required).
> 
> -- 
> Cheers,
> Stephen Rothwells...@canb.auug.org.au
> 
> diff --cc fs/ext4/extents_status.c
> index 80dcc59,4bce4f0..000
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@@ -876,58 -878,32 +876,63 @@@ int ext4_es_zeroout(struct inode *inode
>EXTENT_STATUS_WRITTEN);
>   }
>   
>  +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
>  + struct list_head *b)
>  +{
>  +struct ext4_inode_info *eia, *eib;
>  +unsigned long diff;
>  +
>  +eia = list_entry(a, struct ext4_inode_info, i_es_lru);
>  +eib = list_entry(b, struct ext4_inode_info, i_es_lru);
>  +
>  +diff = eia->i_touch_when - eib->i_touch_when;
>  +if (diff < 0)
>  +return -1;
>  +if (diff > 0)
>  +return 1;
>  +return 0;
>  +}
>   
This comes straight from ext4, so no problem.

Now we just need to make sure that the shrinker is cleanly separated into a
count part and a scan part (a minimal skeleton of that split is sketched at
the end of this message).

> - static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control 
> *sc)
> + static long ext4_es_count(struct shrinker *shrink, struct shrink_control 
> *sc)
> + {
> + long nr;
> + struct ext4_sb_info *sbi = container_of(shrink,
> + struct ext4_sb_info, s_es_shrinker);
> + 
> + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
> + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
> + return nr;
> + }
The count seems okay.

> + 
> + static long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc)
>   {
>   struct ext4_sb_info *sbi = container_of(shrink,
>   struct ext4_sb_info, s_es_shrinker);
>   struct ext4_inode_info *ei;
>  -struct list_head *cur, *tmp, scanned;
>  +struct list_head *cur, *tmp;
>  +LIST_HEAD(skiped);
>   int nr_to_scan = sc->nr_to_scan;
> - int ret, nr_shrunk = 0;
> - 
> - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
> - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
> - 
> - if (!nr_to_scan)
> - return ret;
> + int ret = 0, nr_shrunk = 0;
>   
>  -INIT_LIST_HEAD(&scanned);
>  -
>   spin_lock(&sbi->s_es_lru_lock);
>  +
>  +/*
>  + * If the inode that is at the head of LRU list is newer than
>  + * last_sorted time, that means that we need to sort this list.
>  + */
>  +ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
>  +if (sbi->s_es_last_sorted < ei->i_touch_when) {
>  +list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
>  +sbi->s_es_last_sorted = jiffies;
>  +}
>  +
They are sorting the list, but that doesn't change the number of items they
report. So far it is fine. My only concern here would be locking, but it seems
to be all protected by the s_es_lru_lock.

>   list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
>  -list_move_tail(cur, &scanned);
>  +/*
>  + * If we have already reclaimed all extents from extent
>  + * status tree, just stop the loop immediately.
>  + */
>  +if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
>  +break;
>   
>   ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
>   
> @@@ -951,22 -923,22 +956,22 @@@
>   if (nr_to_scan == 0)
>   break;
>   }
>  -list_splice_tail(&scanned, &sbi->s_es_lru);
>  +
>  +/* Move the newer inodes into the tail of the LRU list. */
>  +list_splice_tail(&skiped, &sbi->s_es_lru);
>   spin_unlock(&sbi->s_es_lru_lock);
>   
> - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
>   trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
> - return ret;
> + return nr_shrunk;
>   }
>   
This part also seems fine.

>  -void ext4_es_register_shrinker(struct super_block *sb)
>  +void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
>   {
>  -struct ext4_sb_info *sbi;
>  -
>  -sbi = EXT4_SB(sb);
>   INIT_LIST_HEAD(&sbi->s_es_lru);
>   spin_lock_init(&sbi->s_es_lru_lock);
>  +sbi->s_es_last_sorted = 0;
> - sbi->s_es_shrinker.shrink = ext4_es_shrink;
> + sbi->s_es_shrinker.scan_objects = ext4_es_scan;
> + sbi->s_es_shrinker.count_objects = ext4_es_count;
>   sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
>   
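
Just to make that split concrete, a minimal shrinker under the new count/scan
API looks roughly like this (illustrative sketch only; foo_info and
foo_free_one() are made up names, not anything in the tree):

struct foo_info {
	struct shrinker shrinker;
	struct percpu_counter nr_cached;	/* how many objects we cache */
};

/* hypothetical: free one cached object, return false when nothing is left */
static bool foo_free_one(struct foo_info *fi);

static long foo_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct foo_info *fi = container_of(shrink, struct foo_info, shrinker);

	/* cheap estimate only - no freeing, no heavy locking */
	return percpu_counter_read_positive(&fi->nr_cached);
}

static long foo_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct foo_info *fi = container_of(shrink, struct foo_info, shrinker);
	unsigned long freed = 0;

	/* free up to sc->nr_to_scan objects and report how many actually went */
	while (freed < sc->nr_to_scan && foo_free_one(fi))
		freed++;

	return freed;
}

static void foo_register_shrinker(struct foo_info *fi)
{
	fi->shrinker.count_objects = foo_count;
	fi->shrinker.scan_objects = foo_scan;
	fi->shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&fi->shrinker);
}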

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 09:13:46AM +0200, Michal Hocko wrote:
> On Tue 18-06-13 10:26:24, Glauber Costa wrote:
> [...]
> > Michal, would you mind testing the following patch?
> >
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 00b804e..48eafa6 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -419,6 +419,8 @@ void inode_add_lru(struct inode *inode)
> >  
> >  static void inode_lru_list_del(struct inode *inode)
> >  {
> > +   if (inode->i_state & I_FREEING)
> > +   return;
> >  
> > if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
> > this_cpu_dec(nr_unused);
> > @@ -609,8 +611,8 @@ void evict_inodes(struct super_block *sb)
> > continue;
> > }
> >  
> > -   inode->i_state |= I_FREEING;
> > inode_lru_list_del(inode);
> > +   inode->i_state |= I_FREEING;
> > spin_unlock(&inode->i_lock);
> > list_add(&inode->i_lru, &dispose);
> > }
> > @@ -653,8 +655,8 @@ int invalidate_inodes(struct super_block *sb, bool 
> > kill_dirty)
> > continue;
> > }
> >  
> > -   inode->i_state |= I_FREEING;
> > inode_lru_list_del(inode);
> > +   inode->i_state |= I_FREEING;
> > spin_unlock(&inode->i_lock);
> > list_add(&inode->i_lru, &dispose);
> > }
> > @@ -1381,9 +1383,8 @@ static void iput_final(struct inode *inode)
> > inode->i_state &= ~I_WILL_FREE;
> > }
> >  
> > +   inode_lru_list_del(inode);
> > inode->i_state |= I_FREEING;
> > -   if (!list_empty(&inode->i_lru))
> > -   inode_lru_list_del(inode);
> > spin_unlock(>i_lock);
> >  
> > evict(inode);
> 
> No luck. I have this on top of inode_lru_isolate one but still can see
> hangs:
> 911 [] __wait_on_freeing_inode+0x9e/0xc0
> [] find_inode_fast+0xa1/0xc0
> [] iget_locked+0x4f/0x180
> [] ext4_iget+0x33/0x9f0
> [] ext4_lookup+0xbc/0x160
> [] lookup_real+0x20/0x60
> [] __lookup_hash+0x34/0x40
> [] path_lookupat+0x7a2/0x830
> [] filename_lookup+0x33/0xd0
> [] user_path_at_empty+0x7b/0xb0
> [] user_path_at+0xc/0x10
> [] vfs_fstatat+0x51/0xb0
> [] vfs_stat+0x16/0x20
> [] sys_newstat+0x1f/0x50
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 21409 [] __wait_on_freeing_inode+0x9e/0xc0
> [] find_inode_fast+0xa1/0xc0
> [] iget_locked+0x4f/0x180
> [] ext4_iget+0x33/0x9f0
> [] ext4_lookup+0xbc/0x160
> [] lookup_real+0x20/0x60
> [] lookup_open+0x175/0x1d0
> [] do_last+0x2de/0x780
> [] path_openat+0xda/0x400
> [] do_filp_open+0x43/0xa0
> [] do_sys_open+0x160/0x1e0
> [] sys_open+0x1c/0x20
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 21745 [] path_lookupat+0x792/0x830
> [] filename_lookup+0x33/0xd0
> [] user_path_at_empty+0x7b/0xb0
> [] user_path_at+0xc/0x10
> [] vfs_fstatat+0x51/0xb0
> [] vfs_stat+0x16/0x20
> [] sys_newstat+0x1f/0x50
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 22032 [] path_lookupat+0x792/0x830
> [] filename_lookup+0x33/0xd0
> [] user_path_at_empty+0x7b/0xb0
> [] user_path_at+0xc/0x10
> [] vfs_fstatat+0x51/0xb0
> [] vfs_stat+0x16/0x20
> [] sys_newstat+0x1f/0x50
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 22621 [] __wait_on_freeing_inode+0x9e/0xc0
> [] find_inode_fast+0xa1/0xc0
> [] iget_locked+0x4f/0x180
> [] ext4_iget+0x33/0x9f0
> [] ext4_lookup+0xbc/0x160
> [] lookup_real+0x20/0x60
> [] lookup_open+0x175/0x1d0
> [] do_last+0x2de/0x780
> [] path_openat+0xda/0x400
> [] do_filp_open+0x43/0xa0
> [] do_sys_open+0x160/0x1e0
> [] sys_open+0x1c/0x20
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 22711 [] do_last+0x2c4/0x780
> [] path_openat+0xda/0x400
> [] do_filp_open+0x43/0xa0
> [] do_sys_open+0x160/0x1e0
> [] sys_open+0x1c/0x20
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 22946 [] do_last+0x2c4/0x780
> [] path_openat+0xda/0x400
> [] do_filp_open+0x43/0xa0
> [] do_sys_open+0x160/0x1e0
> [] sys_open+0x1c/0x20
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> 23393 [] do_last+0x2c4/0x780
> [] path_openat+0xda/0x400
> [] do_filp_open+0x43/0xa0
> [] do_sys_open+0x160/0x1e0
> [] sys_open+0x1c/0x20
> [] system_call_fastpath+0x16/0x1b
> [] 0x
> -- 
Sorry if you already said this before, Michal.

But given the backtrace, are you sure this is LRU-related? You mentioned you
bisected it but found nothing conclusive. I will keep looking, but maybe this
could benefit from a broader look from the fs side.

In any case, the patch we suggested is obviously correct and we should apply it
regardless. I will write it up and send it to Andrew.


Re: linux-next: manual merge of the akpm tree with the ext4 tree

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 05:27:21PM +1000, Stephen Rothwell wrote:
 Hi Andrew,
 
 Today's linux-next merge of the akpm tree got a conflict in
 fs/ext4/extents_status.c between commit 6480bad916be (ext4: improve
 extent cache shrink mechanism to avoid to burn CPU time) from the ext
 tree and commit 1f42d0934b4e (fs: convert fs shrinkers to new scan/count
 API) from the akpm tree.
 
 I fixed it up (I am not sure if the result makes complete sense - see
 below) and can carry the fix as necessary (no action is required).
 
 -- 
 Cheers,
 Stephen Rothwells...@canb.auug.org.au
 
 diff --cc fs/ext4/extents_status.c
 index 80dcc59,4bce4f0..000
 --- a/fs/ext4/extents_status.c
 +++ b/fs/ext4/extents_status.c
 @@@ -876,58 -878,32 +876,63 @@@ int ext4_es_zeroout(struct inode *inode
EXTENT_STATUS_WRITTEN);
   }
   
  +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
  + struct list_head *b)
  +{
  +struct ext4_inode_info *eia, *eib;
  +unsigned long diff;
  +
  +eia = list_entry(a, struct ext4_inode_info, i_es_lru);
  +eib = list_entry(b, struct ext4_inode_info, i_es_lru);
  +
   +diff = eia->i_touch_when - eib->i_touch_when;
   +if (diff < 0)
   +return -1;
   +if (diff > 0)
   +return 1;
  +return 0;
  +}
   
This comes straight from ext4, so no problem.

Now we just need to make sure that the shrinker is clearly separated
between a count part and a scan part.
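To make that split concrete, this is roughly the shape the converted shrinker
should end up with (an illustrative sketch only: the example_* names and the
percpu counter are placeholders, not code from the ext4 patch; only the struct
shrinker fields come from the new scan/count API quoted below):

static struct percpu_counter example_object_cnt;	/* tracks cached objects */
static long example_reclaim(long nr_to_scan);		/* frees up to nr_to_scan objects */

static long example_count(struct shrinker *shrink, struct shrink_control *sc)
{
	/* Cheap estimate of how many objects could be reclaimed right now. */
	return percpu_counter_read_positive(&example_object_cnt);
}

static long example_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	/* Do the actual work and report how many objects were freed. */
	return example_reclaim(sc->nr_to_scan);
}

static struct shrinker example_shrinker = {
	.count_objects	= example_count,
	.scan_objects	= example_scan,
	.seeks		= DEFAULT_SEEKS,
};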

 - static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control 
 *sc)
 + static long ext4_es_count(struct shrinker *shrink, struct shrink_control 
 *sc)
 + {
 + long nr;
 + struct ext4_sb_info *sbi = container_of(shrink,
 + struct ext4_sb_info, s_es_shrinker);
 + 
  + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
  + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
 + return nr;
 + }
The count seems okay.

 + 
 + static long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc)
   {
   struct ext4_sb_info *sbi = container_of(shrink,
   struct ext4_sb_info, s_es_shrinker);
   struct ext4_inode_info *ei;
  -struct list_head *cur, *tmp, scanned;
  +struct list_head *cur, *tmp;
  +LIST_HEAD(skiped);
   int nr_to_scan = sc->nr_to_scan;
  - int ret, nr_shrunk = 0;
  - 
  - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
  - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
 - 
 - if (!nr_to_scan)
 - return ret;
 + int ret = 0, nr_shrunk = 0;
   
   -INIT_LIST_HEAD(&scanned);
   -
    spin_lock(&sbi->s_es_lru_lock);
  +
  +/*
  + * If the inode that is at the head of LRU list is newer than
  + * last_sorted time, that means that we need to sort this list.
  + */
   +ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
   +if (sbi->s_es_last_sorted < ei->i_touch_when) {
   +list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
   +sbi->s_es_last_sorted = jiffies;
  +}
  +
They are sorting the list, but that doesn't change the number of items they
report. So far this is fine. My only concern here would be locking, but it all
seems to be protected by the s_es_lru_lock.

    list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
   -list_move_tail(cur, &scanned);
   +/*
   + * If we have already reclaimed all extents from extent
   + * status tree, just stop the loop immediately.
   + */
   +if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
  +break;
   
   ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
   
 @@@ -951,22 -923,22 +956,22 @@@
   if (nr_to_scan == 0)
   break;
   }
   -list_splice_tail(&scanned, &sbi->s_es_lru);
   +
   +/* Move the newer inodes into the tail of the LRU list. */
   +list_splice_tail(&skiped, &sbi->s_es_lru);
    spin_unlock(&sbi->s_es_lru_lock);
    
  - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
    trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
 - return ret;
 + return nr_shrunk;
   }
   
This part also seems fine.

  -void ext4_es_register_shrinker(struct super_block *sb)
  +void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
   {
  -struct ext4_sb_info *sbi;
  -
  -sbi = EXT4_SB(sb);
    INIT_LIST_HEAD(&sbi->s_es_lru);
    spin_lock_init(&sbi->s_es_lru_lock);
   +sbi->s_es_last_sorted = 0;
  - sbi->s_es_shrinker.shrink = ext4_es_shrink;
  + sbi->s_es_shrinker.scan_objects = ext4_es_scan;
  + sbi->s_es_shrinker.count_objects = ext4_es_count;
    sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
    register_shrinker(&sbi->s_es_shrinker);
   }

And so does this.

I believe the resolution is okay, at least from our 

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 03:57:16PM +0200, Michal Hocko wrote:
 On Wed 19-06-13 11:35:27, Glauber Costa wrote:
 [...]
  Sorry if you said that before Michal.
  
  But given the backtrace, are you sure this is LRU-related?
 
 No idea. I just know that my mm tree behaves correctly after the whole
 series has been reverted (58f6e0c8fb37e8e37d5ac17a61a53ac236c15047) and
 before the latest version of the patchset has been applied.
 
  You mentioned you bisected it but found nothing conclusive.
 
 Yes, but I was interested in crashes and not hangs so I will try it
 again.
 
 I really hope this is not just some stupidness in my tree.
 
Okay. Just looking at the stack trace you provided, it would be hard to
implicate us. But it is not totally unreasonable either, since we touch things
in this area. Right now I would assign more probability to a tricky bug than to
some misconfiguration on your side.


Re: linux-next: manual merge of the akpm tree with the ext4 tree

2013-06-19 Thread Glauber Costa
On Wed, Jun 19, 2013 at 02:59:17PM -0400, Theodore Ts'o wrote:
 On Wed, Jun 19, 2013 at 10:06:35AM -0700, Andrew Morton wrote:
  
  Merge the ext4 change early, please.  The core shrinker changes aren't
  100% certain at this time - first they need to stop oopsing ;)
 
 Ack, sounds like a plan.
 
 Are they oopsing often enough that they are likely to interfere with
 running regression tests on linux-next?
 
I have only received one report so far. It will really depend on how likely it
is for other people to hit it - whether or not other people hit it easily is
also good info...


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-18 Thread Glauber Costa
On Tue, Jun 18, 2013 at 10:19:31AM +0200, Michal Hocko wrote:
> On Tue 18-06-13 02:30:05, Glauber Costa wrote:
> > On Mon, Jun 17, 2013 at 02:35:08PM -0700, Andrew Morton wrote:
> [...]
> > > The trace says shrink_slab_node->super_cache_scan->prune_icache_sb.  So
> > > it's inodes?
> > > 
> > Assuming there is no memory corruption of any sort going on , let's
> > check the code. nr_item is only manipulated in 3 places:
> > 
> > 1) list_lru_add, where it is increased
> > 2) list_lru_del, where it is decreased in case the user have voluntarily 
> > removed the
> >element from the list
> > 3) list_lru_walk_node, where an element is removing during shrink.
> > 
> > All three excerpts seem to be correctly locked, so something like this
> > indicates an imbalance.  Either the element was never added to the
> > list, or it was added, removed, and we didn't notice it. (Again, your
> > backing storage is not XFS, is it? If it is , we have another user to
> > look for)
> 
> No this is ext3. But I can try to test with xfs as well if it helps.
> [...]

XFS won't help this; quite the contrary. The reason I asked is that XFS uses
list_lru for its internal structures as well. So it is actually preferable if
you can reproduce this without it, so we can at least isolate that part.


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-18 Thread Glauber Costa
On Tue, Jun 18, 2013 at 12:46:23PM +1000, Dave Chinner wrote:
> On Tue, Jun 18, 2013 at 02:30:05AM +0400, Glauber Costa wrote:
> > On Mon, Jun 17, 2013 at 02:35:08PM -0700, Andrew Morton wrote:
> > > On Mon, 17 Jun 2013 19:14:12 +0400 Glauber Costa  
> > > wrote:
> > > 
> > > > > I managed to trigger:
> > > > > [ 1015.776029] kernel BUG at mm/list_lru.c:92!
> > > > > [ 1015.776029] invalid opcode:  [#1] SMP
> > > > > with Linux next (next-20130607) with 
> > > > > https://lkml.org/lkml/2013/6/17/203
> > > > > on top. 
> > > > > 
> > > > > This is obviously BUG_ON(nlru->nr_items < 0) and 
> > > > > 81122d0b:   48 85 c0test   %rax,%rax
> > > > > 81122d0e:   49 89 44 24 18  mov%rax,0x18(%r12)
> > > > > 81122d13:   0f 84 87 00 00 00   je 
> > > > > 81122da0 
> > > > > 81122d19:   49 83 7c 24 18 00   cmpq   $0x0,0x18(%r12)
> > > > > 81122d1f:   78 7b   js 
> > > > > 81122d9c 
> > > > > [...]
> > > > > 81122d9c:   0f 0b   ud2
> > > > > 
> > > > > RAX is -1UL.
> > > > Yes, fearing those kind of imbalances, we decided to leave the counter 
> > > > as a signed quantity
> > > > and BUG, instead of an unsigned quantity.
> > > > 
> > > > > 
> > > > > I assume that the current backtrace is of no use and it would most
> > > > > probably be some shrinker which doesn't behave.
> > > > > 
> > > > There are currently 3 users of list_lru in tree: dentries, inodes and 
> > > > xfs.
> > > > Assuming you are not using xfs, we are left with dentries and inodes.
> > > > 
> > > > The first thing to do is to find which one of them is misbehaving. You 
> > > > can try finding
> > > > this out by the address of the list_lru, and where it lays in the 
> > > > superblock.
> > > > 
> > > > Once we know each of them is misbehaving, then we'll have to figure out 
> > > > why.
> > > 
> > > The trace says shrink_slab_node->super_cache_scan->prune_icache_sb.  So
> > > it's inodes?
> > > 
> > Assuming there is no memory corruption of any sort going on , let's check 
> > the code.
> > nr_item is only manipulated in 3 places:
> > 
> > 1) list_lru_add, where it is increased
> > 2) list_lru_del, where it is decreased in case the user have voluntarily 
> > removed the
> >element from the list
> > 3) list_lru_walk_node, where an element is removing during shrink.
> > 
> > All three excerpts seem to be correctly locked, so something like this 
> > indicates an imbalance.
> 
> inode_lru_isolate() looks suspicious to me:
> 
> WARN_ON(inode->i_state & I_NEW);
> inode->i_state |= I_FREEING;
> spin_unlock(&inode->i_lock);
> 
> list_move(&inode->i_lru, freeable);
> this_cpu_dec(nr_unused);
>   return LRU_REMOVED;
> }
> 
> All the other cases where I_FREEING is set and the inode is removed
> from the LRU are completely done under the inode->i_lock. i.e. from
> an external POV, the state change to I_FREEING and removal from LRU
> are supposed to be atomic, but they are not here.
> 
> I'm not sure this is the source of the problem, but it definitely
> needs fixing.
> 
Yes, I missed that yesterday, but it does look suspicious to me as well.

Michal, could you manually move this one inside the lock too and see whether it
fixes your problem? Otherwise I can send you a patch, so we don't get lost on
what is patched and what is not.

Let us at least know if this is the problem.
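For clarity, the reordering being discussed for inode_lru_isolate() would look
roughly like this (a sketch only, not a tested patch - the point is that the
I_FREEING transition and the LRU removal stay together under inode->i_lock, so
they look atomic from the outside):

	WARN_ON(inode->i_state & I_NEW);
	inode->i_state |= I_FREEING;
	/* Remove from the LRU and fix the accounting before dropping i_lock. */
	list_move(&inode->i_lru, freeable);
	this_cpu_dec(nr_unused);
	spin_unlock(&inode->i_lock);
	return LRU_REMOVED;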

> > callers:
> > iput_final, evict_inodes, invalidate_inodes.
> > Both evict_inodes and invalidate_inodes will do the following pattern:
> > 
> > inode->i_state |= I_FREEING;
> > 
> > inode_lru_list_del(inode);
> > spin_unlock(&inode->i_lock);
> > list_add(&inode->i_lru, &dispose);
> > 
> > IOW, they will remove the element from the LRU, and add it to the dispose 
> > list.
> > Both of them will also bail out if they see I_FREEING already set, so they 
> > are safe
> > against each other - because the flag is manipulated inside the lock.
> 

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-18 Thread Glauber Costa
On Tue, Jun 18, 2013 at 02:30:05AM +0400, Glauber Costa wrote:
> On Mon, Jun 17, 2013 at 02:35:08PM -0700, Andrew Morton wrote:
> > On Mon, 17 Jun 2013 19:14:12 +0400 Glauber Costa  wrote:
> > 
> > > > I managed to trigger:
> > > > [ 1015.776029] kernel BUG at mm/list_lru.c:92!
> > > > [ 1015.776029] invalid opcode:  [#1] SMP
> > > > with Linux next (next-20130607) with https://lkml.org/lkml/2013/6/17/203
> > > > on top. 
> > > > 
> > > > This is obviously BUG_ON(nlru->nr_items < 0) and 
> > > > 81122d0b:   48 85 c0test   %rax,%rax
> > > > 81122d0e:   49 89 44 24 18  mov%rax,0x18(%r12)
> > > > 81122d13:   0f 84 87 00 00 00   je 81122da0 
> > > > 
> > > > 81122d19:   49 83 7c 24 18 00   cmpq   $0x0,0x18(%r12)
> > > > 81122d1f:   78 7b   js 81122d9c 
> > > > 
> > > > [...]
> > > > 81122d9c:   0f 0b   ud2
> > > > 
> > > > RAX is -1UL.
> > > Yes, fearing those kind of imbalances, we decided to leave the counter as 
> > > a signed quantity
> > > and BUG, instead of an unsigned quantity.
> > > 
> > > > 
> > > > I assume that the current backtrace is of no use and it would most
> > > > probably be some shrinker which doesn't behave.
> > > > 
> > > There are currently 3 users of list_lru in tree: dentries, inodes and xfs.
> > > Assuming you are not using xfs, we are left with dentries and inodes.
> > > 
> > > The first thing to do is to find which one of them is misbehaving. You 
> > > can try finding
> > > this out by the address of the list_lru, and where it lays in the 
> > > superblock.
> > > 
> > > Once we know each of them is misbehaving, then we'll have to figure out 
> > > why.
> > 
> > The trace says shrink_slab_node->super_cache_scan->prune_icache_sb.  So
> > it's inodes?
> > 
> Assuming there is no memory corruption of any sort going on , let's check the 
> code.
> nr_item is only manipulated in 3 places:
> 
> 1) list_lru_add, where it is increased
> 2) list_lru_del, where it is decreased in case the user have voluntarily 
> removed the
>element from the list
> 3) list_lru_walk_node, where an element is removing during shrink.
> 
> All three excerpts seem to be correctly locked, so something like this 
> indicates an imbalance.
> Either the element was never added to the list, or it was added, removed, and 
> we didn't notice
> it. (Again, your backing storage is not XFS, is it? If it is , we have 
> another user to look for)
> 
> I will assume that Andrew is correct and this is inode related. list_lru_del 
> reads as follows:
> > spin_lock(&nlru->lock);
> if (!list_empty(item)) { ... }
> 
> So one possibility is that we are manipulating this list outside this lock 
> somewhere. Going to
> inode.c... We always manipulate the LRU inside the lock, but the element is 
> not always in the LRU,
> if it is in a list. Could it be possible that the element is in the 
> dispose_list, and at the same
> time someone calls list_lru_del at it, creating the imbalance ?
> 
> callers:
> iput_final, evict_inodes, invalidate_inodes.
> Both evict_inodes and invalidate_inodes will do the following pattern:
> 
> inode->i_state |= I_FREEING;  
>   
> inode_lru_list_del(inode);
> > spin_unlock(&inode->i_lock);
> > list_add(&inode->i_lru, &dispose);
> 
> IOW, they will remove the element from the LRU, and add it to the dispose 
> list.
> Both of them will also bail out if they see I_FREEING already set, so they 
> are safe
> against each other - because the flag is manipulated inside the lock.
> 
> But how about iput_final? It seems to me that if we are calling iput_final at 
> the
> same time as the other two, this *could* happen (maybe there is some extra 
> protection
> that can be seen from Australia but not from here. Dave?)
> 
> Right now this is my best theory.
> 
> I am attaching a patch that should make a difference in case I am right.
> 
> 
> 

Which is obviously borked, since I did not fix the other callers to move
I_FREEING after the LRU del.

Michal, would you mind testing the following patch?

diff --git a/fs/inode.c b/fs/inode.c
index 00b804e..48eafa6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -419,6 +419,8 @@ void inode_add_lru(struct ino

Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-18 Thread Glauber Costa
On Tue, Jun 18, 2013 at 12:46:23PM +1000, Dave Chinner wrote:
 On Tue, Jun 18, 2013 at 02:30:05AM +0400, Glauber Costa wrote:
  On Mon, Jun 17, 2013 at 02:35:08PM -0700, Andrew Morton wrote:
   On Mon, 17 Jun 2013 19:14:12 +0400 Glauber Costa glom...@gmail.com 
   wrote:
   
 I managed to trigger:
 [ 1015.776029] kernel BUG at mm/list_lru.c:92!
 [ 1015.776029] invalid opcode:  [#1] SMP
 with Linux next (next-20130607) with 
 https://lkml.org/lkml/2013/6/17/203
 on top. 
 
 This is obviously BUG_ON(nlru-nr_items  0) and 
 81122d0b:   48 85 c0test   %rax,%rax
 81122d0e:   49 89 44 24 18  mov%rax,0x18(%r12)
 81122d13:   0f 84 87 00 00 00   je 
 81122da0 list_lru_walk_node+0x110
 81122d19:   49 83 7c 24 18 00   cmpq   $0x0,0x18(%r12)
 81122d1f:   78 7b   js 
 81122d9c list_lru_walk_node+0x10c
 [...]
 81122d9c:   0f 0b   ud2
 
 RAX is -1UL.
Yes, fearing those kind of imbalances, we decided to leave the counter 
as a signed quantity
and BUG, instead of an unsigned quantity.

 
 I assume that the current backtrace is of no use and it would most
 probably be some shrinker which doesn't behave.
 
There are currently 3 users of list_lru in tree: dentries, inodes and 
xfs.
Assuming you are not using xfs, we are left with dentries and inodes.

The first thing to do is to find which one of them is misbehaving. You 
can try finding
this out by the address of the list_lru, and where it lays in the 
superblock.

Once we know each of them is misbehaving, then we'll have to figure out 
why.
   
   The trace says shrink_slab_node-super_cache_scan-prune_icache_sb.  So
   it's inodes?
   
  Assuming there is no memory corruption of any sort going on , let's check 
  the code.
  nr_item is only manipulated in 3 places:
  
  1) list_lru_add, where it is increased
  2) list_lru_del, where it is decreased in case the user have voluntarily 
  removed the
 element from the list
  3) list_lru_walk_node, where an element is removing during shrink.
  
  All three excerpts seem to be correctly locked, so something like this 
  indicates an imbalance.
 
 inode_lru_isolate() looks suspicious to me:
 
 WARN_ON(inode-i_state  I_NEW);
 inode-i_state |= I_FREEING;
 spin_unlock(inode-i_lock);
 
 list_move(inode-i_lru, freeable);
 this_cpu_dec(nr_unused);
   return LRU_REMOVED;
 }
 
 All the other cases where I_FREEING is set and the inode is removed
 from the LRU are completely done under the inode-i_lock. i.e. from
 an external POV, the state change to I_FREEING and removal from LRU
 are supposed to be atomic, but they are not here.
 
 I'm not sure this is the source of the problem, but it definitely
 needs fixing.
 
Yes, I missed that yesterday, but that does look suspicious to me as well.

Michal, if you can manually move this one inside the lock as well and see
if it fixes your problem as well... Otherwise I can send you a patch as well
so we don't get lost on what is patched and what is not.

Let us at least know if this is the problem.

  callers:
  iput_final, evict_inodes, invalidate_inodes.
  Both evict_inodes and invalidate_inodes will do the following pattern:
  
  inode-i_state |= I_FREEING;
  
  inode_lru_list_del(inode);
  spin_unlock(inode-i_lock);
  list_add(inode-i_lru, dispose);
  
  IOW, they will remove the element from the LRU, and add it to the dispose 
  list.
  Both of them will also bail out if they see I_FREEING already set, so they 
  are safe
  against each other - because the flag is manipulated inside the lock.
  
  But how about iput_final? It seems to me that if we are calling iput_final 
  at the
  same time as the other two, this *could* happen (maybe there is some extra 
  protection
  that can be seen from Australia but not from here. Dave?)
 
 If I_FREEING is set before we enter iput_final(), then something
 else is screwed up. I_FREEING is only set once the last reference
 has gone away and we are killing the inode. All the other callers
 that set I_FREEING check that the reference count on the inode is
 zero before they set I_FREEING. Hence I_FREEING cannot be set on the
 transition of i_count from 1 to 0 when iput_final() is called. So
 the patch won't do anything to avoid the problem being seen.
 
Yes, but aren't things like evict_inodes and invalidate_inodes called at
umount time, for instance? Can't it happen that we drop the last reference
to a valid, in-use inode while someone else is invalidating them all?

 Keep in mind that we this is actually a new warning on the count of
 inodes on the LRU - we never had a check that it didn't go



Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-17 Thread Glauber Costa
On Mon, Jun 17, 2013 at 02:35:08PM -0700, Andrew Morton wrote:
> On Mon, 17 Jun 2013 19:14:12 +0400 Glauber Costa  wrote:
> 
> > > I managed to trigger:
> > > [ 1015.776029] kernel BUG at mm/list_lru.c:92!
> > > [ 1015.776029] invalid opcode:  [#1] SMP
> > > with Linux next (next-20130607) with https://lkml.org/lkml/2013/6/17/203
> > > on top. 
> > > 
> > > This is obviously BUG_ON(nlru->nr_items < 0) and 
> > > 81122d0b:   48 85 c0test   %rax,%rax
> > > 81122d0e:   49 89 44 24 18  mov%rax,0x18(%r12)
> > > 81122d13:   0f 84 87 00 00 00   je 81122da0 
> > > 
> > > 81122d19:   49 83 7c 24 18 00   cmpq   $0x0,0x18(%r12)
> > > 81122d1f:   78 7b   js 81122d9c 
> > > 
> > > [...]
> > > 81122d9c:   0f 0b   ud2
> > > 
> > > RAX is -1UL.
> > Yes, fearing those kind of imbalances, we decided to leave the counter as a 
> > signed quantity
> > and BUG, instead of an unsigned quantity.
> > 
> > > 
> > > I assume that the current backtrace is of no use and it would most
> > > probably be some shrinker which doesn't behave.
> > > 
> > There are currently 3 users of list_lru in tree: dentries, inodes and xfs.
> > Assuming you are not using xfs, we are left with dentries and inodes.
> > 
> > The first thing to do is to find which one of them is misbehaving. You can 
> > try finding
> > this out by the address of the list_lru, and where it lays in the 
> > superblock.
> > 
> > Once we know each of them is misbehaving, then we'll have to figure out why.
> 
> The trace says shrink_slab_node->super_cache_scan->prune_icache_sb.  So
> it's inodes?
> 
Assuming there is no memory corruption of any sort going on, let's check the
code. nr_items is only manipulated in 3 places:

1) list_lru_add, where it is increased
2) list_lru_del, where it is decreased in case the user has voluntarily removed
   the element from the list
3) list_lru_walk_node, where an element is removed during shrink.

All three excerpts seem to be correctly locked, so something like this indicates
an imbalance. Either the element was never added to the list, or it was added,
removed, and we didn't notice it. (Again, your backing storage is not XFS, is
it? If it is, we have another user to look for.)

I will assume that Andrew is correct and this is inode related. list_lru_del
reads as follows:
spin_lock(&nlru->lock);
if (!list_empty(item)) { ... }

So one possibility is that we are manipulating this list outside this lock
somewhere. Going to inode.c... We always manipulate the LRU inside the lock, but
being on a list does not mean the element is on the LRU. Could it be that the
element is sitting on the dispose list and, at the same time, someone calls
list_lru_del on it, creating the imbalance?
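For reference, the del path is essentially the following (a simplified sketch
of mm/list_lru.c from memory, not the exact code - the active_nodes bookkeeping
is omitted):

bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
	struct list_lru_node *nlru = &lru->node[page_to_nid(virt_to_page(item))];

	spin_lock(&nlru->lock);
	if (!list_empty(item)) {
		/* Only items found on *some* list get accounted here. */
		list_del_init(item);
		nlru->nr_items--;
		spin_unlock(&nlru->lock);
		return true;
	}
	spin_unlock(&nlru->lock);
	return false;
}

Note that the list_empty() check only proves the item is on some list: if the
item has already been taken off the LRU (and accounted for) and is now sitting
on a private dispose list, a second list_lru_del() will still see a non-empty
list_head and decrement nr_items again - exactly the kind of imbalance
suspected above.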

callers:
iput_final, evict_inodes, invalidate_inodes.
Both evict_inodes and invalidate_inodes will do the following pattern:

inode->i_state |= I_FREEING;

inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);

IOW, they will remove the element from the LRU, and add it to the dispose list.
Both of them will also bail out if they see I_FREEING already set, so they are 
safe
against each other - because the flag is manipulated inside the lock.

But how about iput_final? It seems to me that if we are calling iput_final at 
the
same time as the other two, this *could* happen (maybe there is some extra 
protection
that can be seen from Australia but not from here. Dave?)

Right now this is my best theory.

I am attaching a patch that should make a difference in case I am right.




diff --git a/fs/inode.c b/fs/inode.c
index 00b804e..c46c92e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -419,6 +419,8 @@ void inode_add_lru(struct inode *inode)
 
 static void inode_lru_list_del(struct inode *inode)
 {
+   if (inode->i_state & I_FREEING)
+   return;
 
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
@@ -1381,9 +1383,8 @@ static void iput_final(struct inode *inode)
inode->i_state &= ~I_WILL_FREE;
}
 
+   inode_lru_list_del(inode);
inode->i_state |= I_FREEING;
-   if (!list_empty(&inode->i_lru))
-   inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
 
evict(inode);


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-17 Thread Glauber Costa
On Mon, Jun 17, 2013 at 05:33:02PM +0200, Michal Hocko wrote:
> On Mon 17-06-13 19:14:12, Glauber Costa wrote:
> > On Mon, Jun 17, 2013 at 04:18:22PM +0200, Michal Hocko wrote:
> > > Hi,
> > 
> > Hi,
> > 
> > > I managed to trigger:
> > > [ 1015.776029] kernel BUG at mm/list_lru.c:92!
> > > [ 1015.776029] invalid opcode:  [#1] SMP
> > > with Linux next (next-20130607) with https://lkml.org/lkml/2013/6/17/203
> > > on top. 
> > > 
> > > This is obviously BUG_ON(nlru->nr_items < 0) and 
> > > 81122d0b:   48 85 c0test   %rax,%rax
> > > 81122d0e:   49 89 44 24 18  mov%rax,0x18(%r12)
> > > 81122d13:   0f 84 87 00 00 00   je 81122da0 
> > > 
> > > 81122d19:   49 83 7c 24 18 00   cmpq   $0x0,0x18(%r12)
> > > 81122d1f:   78 7b   js 81122d9c 
> > > 
> > > [...]
> > > 81122d9c:   0f 0b   ud2
> > > 
> > > RAX is -1UL.
> >
> > Yes, fearing those kind of imbalances, we decided to leave the counter
> > as a signed quantity and BUG, instead of an unsigned quantity.
> > 
> > > I assume that the current backtrace is of no use and it would most
> > > probably be some shrinker which doesn't behave.
> > > 
> > There are currently 3 users of list_lru in tree: dentries, inodes and xfs.
> > Assuming you are not using xfs, we are left with dentries and inodes.
> > 
> > The first thing to do is to find which one of them is misbehaving. You
> > can try finding this out by the address of the list_lru, and where it
> > lays in the superblock.
> 
> I am not sure I understand. Care to prepare a debugging patch for me?
>  
> > Once we know each of them is misbehaving, then we'll have to figure
> > out why.
> > 
> > Any special filesystem workload ?
> 
> This is two parallel kernel builds with separate kernel trees running
> under 2 hard unlimitted groups (with 0 soft limit) followed by rm -rf
> source trees + drop caches. Sometimes I have to repeat this multiple
> times. I can also see some timer specific crashes which are most
> probably not related so I am getting back to my mm tree and will hope
> the tree is healthy.
> 
> I have seen some other traces as well (mentioning ext3 dput paths) but I
> cannot reproduce them anymore.
> 

Do you have those traces? If there is a bug in the ext3 dput path, then it is
most likely the culprit. dput() is where we insert things into the LRU. So if
we fail to fully insert an element that we should have inserted - and later
try to remove it - we'll go negative.

Can we see those traces?


Re: linux-next: slab shrinkers: BUG at mm/list_lru.c:92

2013-06-17 Thread Glauber Costa
On Mon, Jun 17, 2013 at 04:18:22PM +0200, Michal Hocko wrote:
> Hi,

Hi,

> I managed to trigger:
> [ 1015.776029] kernel BUG at mm/list_lru.c:92!
> [ 1015.776029] invalid opcode:  [#1] SMP
> with Linux next (next-20130607) with https://lkml.org/lkml/2013/6/17/203
> on top. 
> 
> This is obviously BUG_ON(nlru->nr_items < 0) and 
> 81122d0b:   48 85 c0test   %rax,%rax
> 81122d0e:   49 89 44 24 18  mov%rax,0x18(%r12)
> 81122d13:   0f 84 87 00 00 00   je 81122da0 
> 
> 81122d19:   49 83 7c 24 18 00   cmpq   $0x0,0x18(%r12)
> 81122d1f:   78 7b   js 81122d9c 
> 
> [...]
> 81122d9c:   0f 0b   ud2
> 
> RAX is -1UL.
Yes, fearing that kind of imbalance, we decided to leave the counter as a
signed quantity and BUG, instead of using an unsigned quantity.

> 
> I assume that the current backtrace is of no use and it would most
> probably be some shrinker which doesn't behave.
> 
There are currently 3 users of list_lru in tree: dentries, inodes and xfs.
Assuming you are not using xfs, we are left with dentries and inodes.

The first thing to do is to find which one of them is misbehaving. You can try
finding this out by the address of the list_lru, and where it lies in the
superblock (see the debugging sketch below).

Once we know which of them is misbehaving, then we'll have to figure out why.

Any special filesystem workload?
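Something along these lines would do as a quick debugging aid (a sketch only,
assuming the series' sb->s_dentry_lru and sb->s_inode_lru fields; it just
prints the two lru addresses so they can be matched against the one in the
crash):

	/*
	 * Debugging sketch: dump each super_block's lru addresses so the
	 * address seen in the BUG can be attributed to dentries or inodes.
	 */
	printk(KERN_INFO "sb %p (%s): s_dentry_lru=%p s_inode_lru=%p\n",
	       sb, sb->s_type->name, &sb->s_dentry_lru, &sb->s_inode_lru);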










Re: [PATCH 11/11] cgroup: use percpu refcnt for cgroup_subsys_states

2013-06-14 Thread Glauber Costa
On Fri, Jun 14, 2013 at 02:55:39PM +0200, Michal Hocko wrote:
> On Wed 12-06-13 21:04:58, Tejun Heo wrote:
> [...]
> > +/**
> > + * cgroup_destroy_locked - the first stage of cgroup destruction
> > + * @cgrp: cgroup to be destroyed
> > + *
> > + * css's make use of percpu refcnts whose killing latency shouldn't be
> > + * exposed to userland and are RCU protected.  Also, cgroup core needs to
> > + * guarantee that css_tryget() won't succeed by the time ->css_offline() is
> > + * invoked.  To satisfy all the requirements, destruction is implemented in
> > + * the following two steps.
> > + *
> > + * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
> > + * userland visible parts and start killing the percpu refcnts of
> > + * css's.  Set up so that the next stage will be kicked off once all
> > + * the percpu refcnts are confirmed to be killed.
> > + *
> > + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
> > + * rest of destruction.  Once all cgroup references are gone, the
> > + * cgroup is RCU-freed.
> > + *
> > + * This function implements s1.  After this step, @cgrp is gone as far as
> > + * the userland is concerned and a new cgroup with the same name may be
> > + * created.  As cgroup doesn't care about the names internally, this
> > + * doesn't cause any problem.
> 
> Glauber is this assumption correct for kmem caches naming scheme?
> I guess it should, but I would rather be sure this won't blow up later
> specially when caches might live longer than css_offline.
> 

We append names to caches, but never the name alone, always (kmemcg_id:name).
So you can reuse the same name as many times as you want (from the kmemcg
perspective), provided you use different kmemcg_ids. Because kmemcg_ids are
dissociated from css_ids (partly because I was already seeing people talking
about wanting to free the css_ids earlier), we should not have any problems
with this.
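
As a minimal illustration of that convention (the exact format string below is
an assumption for illustration, not a quote of the kernel code), the kmemcg_id
keeps the resulting cache name unique even when a cgroup name is reused:

#include <stdio.h>

static void cache_name(char *buf, size_t len, const char *root_cache,
                       int kmemcg_id, const char *cgroup_name)
{
    /* per-memcg cache name embeds the kmemcg_id: root(name:id-style) */
    snprintf(buf, len, "%s(%d:%s)", root_cache, kmemcg_id, cgroup_name);
}

int main(void)
{
    char a[64], b[64];

    /* same cgroup name "foo", different (hypothetical) kmemcg_ids */
    cache_name(a, sizeof(a), "dentry", 2, "foo");
    cache_name(b, sizeof(b), "dentry", 5, "foo");
    printf("%s\n%s\n", a, b);   /* dentry(2:foo) vs dentry(5:foo) */
    return 0;
}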


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/9] memcg: use css_get/put when charging/uncharging kmem

2013-06-14 Thread Glauber Costa
On Fri, Jun 14, 2013 at 09:17:40AM +0800, Li Zefan wrote:
> >>  static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
> >>  {
> >> +  /*
> >> +   * We need to call css_get() first, because memcg_uncharge_kmem()
> >> +   * will call css_put() if it sees the memcg is dead.
> >> +   */
> >> +  smp_wmb();
> >>    if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
> >>        set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
> > 
> > I do not feel strongly about that but maybe open coding this in
> > mem_cgroup_css_offline would be even better. There is only single caller
> > and there is smaller chance somebody will use the function incorrectly
> > later on.
> > 
> > So I leave the decision on you because this doesn't matter much.
> > 
> 
> Yeah, it should go away soon. I'll post a patch after this patchset gets
> merged into -mm tree and then we can discuss there.
 
I don't care if it is open coded or not. If there is any strong preference
from anyone on this matter, feel free to change it.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v7 00/11] per-cgroup cpu-stat

2013-06-06 Thread Glauber Costa
On 06/06/2013 05:49 AM, Tejun Heo wrote:
> I generally agree with the approach but which tree is it based on?
> Can you please let me know the base commit so that I can review the
> series properly?
last week's -tip/master
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] memcg: don't initialize kmem-cache destroying work for root caches

2013-05-30 Thread Glauber Costa
On 05/29/2013 06:48 AM, Glauber Costa wrote:
> On 05/29/2013 04:23 AM, Andrew Morton wrote:
>> On Tue, 14 May 2013 16:38:38 +0400 Andrey Vagin  wrote:
>>
>>> struct memcg_cache_params has a union. Different parts of this union are
>>> used for root and non-root caches. A part with destroying work is used only
>>> for non-root caches.
>>
>> That union is a bit dangerous.  Perhaps it would be better to do
>> something like
>>
>> --- a/include/linux/slab.h~a
>> +++ a/include/linux/slab.h
>> @@ -337,15 +337,17 @@ static __always_inline int kmalloc_size(
>>  struct memcg_cache_params {
>>  bool is_root_cache;
>>  union {
>> -struct kmem_cache *memcg_caches[0];
>> -struct {
>> +struct memcg_root_cache {
>> +struct kmem_cache *caches[0];
>> +} memcg_root_cache;
>> +struct memcg_child_cache {
>>  struct mem_cgroup *memcg;
>>  struct list_head list;
>>  struct kmem_cache *root_cache;
>>  bool dead;
>>  atomic_t nr_pages;
>>  struct work_struct destroy;
>> -};
>> +} memcg_child_cache;
>>  };
>>  };
>>
>> And then adopt the convention of selecting either memcg_root_cache or
>> memcg_child_cache at the highest level then passing the more strongly
>> typed pointer to callees.
>>
> 
> Since it is already creating problems, yes, I agree.
> 
> I will try to cook up something soon.
> 

There are other cleanups being requested as well. (Tejun claims that we
would be better off with locks than with barriers for the destruction
path). To avoid conflicting with the current shrinkers work - which is
very massive - and since none of those are pressing, I will try to tackle
both next week (on top of that, if possible).
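
For reference, a minimal userspace sketch of the split Andrew suggests above
(assumptions: the field list is trimmed, and the memcg_root()/memcg_child()
helpers are hypothetical names, not existing kernel functions). The strongly
typed member is selected once at the top level, and only that pointer is
passed to callees:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct kmem_cache;
struct mem_cgroup;

struct memcg_cache_params {
    bool is_root_cache;
    union {
        struct memcg_root_cache {
            struct kmem_cache *caches[0];
        } memcg_root_cache;
        struct memcg_child_cache {
            struct mem_cgroup *memcg;
            struct kmem_cache *root_cache;
            bool dead;
        } memcg_child_cache;
    };
};

static struct memcg_root_cache *memcg_root(struct memcg_cache_params *p)
{
    return p->is_root_cache ? &p->memcg_root_cache : NULL;
}

static struct memcg_child_cache *memcg_child(struct memcg_cache_params *p)
{
    return p->is_root_cache ? NULL : &p->memcg_child_cache;
}

int main(void)
{
    struct memcg_cache_params params;
    struct memcg_child_cache *child;

    params.is_root_cache = false;
    params.memcg_child_cache.dead = false;

    child = memcg_child(&params);       /* pick the typed view once */
    if (child)
        child->dead = true;             /* callees never touch the union */

    printf("root=%p dead=%d\n", (void *)memcg_root(&params),
           params.memcg_child_cache.dead);
    return 0;
}
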
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7 02/11] cgroup: implement CFTYPE_NO_PREFIX

2013-05-29 Thread Glauber Costa
From: Tejun Heo 

When cgroup files are created, cgroup core automatically prepends the
name of the subsystem as prefix.  This patch adds CFTYPE_NO_PREFIX
which disables the automatic prefix.

This will be used to deprecate cpuacct which will make cpu create and
serve the cpuacct files.

Signed-off-by: Tejun Heo 
Cc: Peter Zijlstra 
Cc: Glauber Costa 
---
 include/linux/cgroup.h | 1 +
 kernel/cgroup.c| 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5047355..5a8a093 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -397,6 +397,7 @@ struct cgroup_map_cb {
 #define CFTYPE_ONLY_ON_ROOT    (1U << 0)   /* only create on root cg */
 #define CFTYPE_NOT_ON_ROOT     (1U << 1)   /* don't create on root cg */
 #define CFTYPE_INSANE          (1U << 2)   /* don't create if sane_behavior */
+#define CFTYPE_NO_PREFIX       (1U << 3)   /* skip subsys prefix */
 
 #define MAX_CFTYPE_NAME        64
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2a99262..8bbeb4d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2681,7 +2681,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct 
cgroup_subsys *subsys,
umode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
-   if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+   if (subsys && !(cft->flags & CFTYPE_NO_PREFIX) &&
+   !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
strcpy(name, subsys->name);
strcat(name, ".");
}
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7 03/11] cgroup, sched: let cpu serve the same files as cpuacct

2013-05-29 Thread Glauber Costa
From: Tejun Heo 

cpuacct being on a separate hierarchy is one of the main cgroup
related complaints from scheduler side and the consensus seems to be

* Allowing cpuacct to be a separate controller was a mistake.  In
  general multiple controllers on the same type of resource should be
  avoided, especially accounting-only ones.

* Statistics provided by cpuacct are useful and should instead be
  served by cpu.

This patch makes cpu maintain and serve all cpuacct.* files and make
cgroup core ignore cpuacct if it's co-mounted with cpu.  This is a
step in deprecating cpuacct.  The next patch will allow disabling or
dropping cpuacct without affecting userland too much.

Note that this creates some discrepancies in /proc/cgroups and
/proc/PID/cgroup.  The co-mounted cpuacct won't be reflected correctly
there.  cpuacct will eventually be removed completely probably except
for the statistics filenames and I'd like to keep the amount of
compatibility hackery to a minimum as much as possible.

The cpu statistics implementation isn't optimized in any way.  It's
mostly verbatim copy from cpuacct.  The goal is allowing quick
disabling and removal of CONFIG_CGROUP_CPUACCT and creating a base on
top of which cpu can implement proper optimization.

[ glommer: don't call *_charge in stop_task.c ]

Signed-off-by: Tejun Heo 
Signed-off-by: Glauber Costa 
Cc: Peter Zijlstra 
Cc: Michal Hocko 
Cc: Kay Sievers 
Cc: Lennart Poettering 
Cc: Dave Jones 
Cc: Ben Hutchings 
Cc: Paul Turner 
---
 kernel/cgroup.c|  13 
 kernel/sched/core.c| 173 +
 kernel/sched/cputime.c |  23 +++
 kernel/sched/fair.c|   1 +
 kernel/sched/rt.c  |   1 +
 kernel/sched/sched.h   |   7 ++
 6 files changed, 218 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bbeb4d..01c3ed0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1256,6 +1256,19 @@ static int parse_cgroupfs_options(char *data, struct 
cgroup_sb_opts *opts)
}
 
/*
+* cpuacct is deprecated and cpu will serve the same stat files.
+* If co-mount with cpu is requested, ignore cpuacct.  Note that
+* this creates some discrepancies in /proc/cgroups and
+* /proc/PID/cgroup.
+*
+* https://lkml.org/lkml/2012/9/13/542
+*/
+#if IS_ENABLED(CONFIG_CGROUP_SCHED) && IS_ENABLED(CONFIG_CGROUP_CPUACCT)
+   if ((opts->subsys_mask & (1 << cpu_cgroup_subsys_id)) &&
+   (opts->subsys_mask & (1 << cpuacct_subsys_id)))
+   opts->subsys_mask &= ~(1 << cpuacct_subsys_id);
+#endif
+   /*
 * Option noprefix was introduced just for backward compatibility
 * with the old cpuset, so we allow noprefix only if mounting just
 * the cpuset subsystem.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 36f85be..5ae1adf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6327,6 +6327,7 @@ int in_sched_functions(unsigned long addr)
  */
 struct task_group root_task_group;
 LIST_HEAD(task_groups);
+static DEFINE_PER_CPU(u64, root_tg_cpuusage);
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -6385,6 +6386,8 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
+   root_task_group.cpustat = &kernel_cpustat;
+   root_task_group.cpuusage = &root_tg_cpuusage;
    list_add(&root_task_group.list, &task_groups);
    INIT_LIST_HEAD(&root_task_group.children);
    INIT_LIST_HEAD(&root_task_group.siblings);
@@ -6665,6 +6668,8 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
+   free_percpu(tg->cpuusage);
+   free_percpu(tg->cpustat);
kfree(tg);
 }
 
@@ -6677,6 +6682,11 @@ struct task_group *sched_create_group(struct task_group 
*parent)
if (!tg)
return ERR_PTR(-ENOMEM);
 
+   tg->cpuusage = alloc_percpu(u64);
+   tg->cpustat = alloc_percpu(struct kernel_cpustat);
+   if (!tg->cpuusage || !tg->cpustat)
+   goto err;
+
if (!alloc_fair_sched_group(tg, parent))
goto err;
 
@@ -6776,6 +6786,24 @@ void sched_move_task(struct task_struct *tsk)
 
    task_rq_unlock(rq, tsk, &flags);
 }
+
+void task_group_charge(struct task_struct *tsk, u64 cputime)
+{
+   struct task_group *tg;
+   int cpu = task_cpu(tsk);
+
+   rcu_read_lock();
+
+   tg = container_of(task_subsys_state(tsk, cpu_cgroup_subsys_id),
+ struct task_group, css);
+
+   for (; tg; tg = tg->parent) {
+   u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
+   *cpuusage += cputime;
+   }
+
+   rcu_read_unlock();
+}
 #endif /* CONFIG_CGROUP_SCHED */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -7171,6 +7199,134 @@ cpu_cgroup_exit(struct cgroup *

[PATCH v7 05/11] cpuacct: don't actually do anything.

2013-05-29 Thread Glauber Costa
All the information we have that is needed for cpuusage (and
cpuusage_percpu) is present in schedstats. It is already recorded
in a sane hierarchical way.

If we have CONFIG_SCHEDSTATS, we don't really need to do any extra
work. All former functions become empty inlines.

Signed-off-by: Glauber Costa 
Cc: Peter Zijlstra 
Cc: Michal Hocko 
Cc: Kay Sievers 
Cc: Lennart Poettering 
Cc: Dave Jones 
Cc: Ben Hutchings 
Cc: Paul Turner 
---
 kernel/sched/core.c  | 102 ++-
 kernel/sched/sched.h |  10 +++--
 2 files changed, 90 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ae1adf..0fa0f87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6787,6 +6787,7 @@ void sched_move_task(struct task_struct *tsk)
    task_rq_unlock(rq, tsk, &flags);
 }
 
+#ifndef CONFIG_SCHEDSTATS
 void task_group_charge(struct task_struct *tsk, u64 cputime)
 {
struct task_group *tg;
@@ -6804,6 +6805,7 @@ void task_group_charge(struct task_struct *tsk, u64 
cputime)
 
rcu_read_unlock();
 }
+#endif
 #endif /* CONFIG_CGROUP_SCHED */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -7199,22 +7201,92 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup 
*old_cgrp,
sched_move_task(task);
 }
 
-static u64 task_group_cpuusage_read(struct task_group *tg, int cpu)
+/*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+static inline void lock_rq_dword(int cpu)
 {
-   u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
-   u64 data;
-
 #ifndef CONFIG_64BIT
-   /*
-* Take rq->lock to make 64-bit read safe on 32-bit platforms.
-*/
    raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-   data = *cpuusage;
+#endif
+}
+
+static inline void unlock_rq_dword(int cpu)
+{
+#ifndef CONFIG_64BIT
    raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+}
+
+#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline u64 cfs_exec_clock(struct task_group *tg, int cpu)
+{
+   return tg->cfs_rq[cpu]->exec_clock - tg->cfs_rq[cpu]->prev_exec_clock;
+}
+
+static inline void cfs_exec_clock_reset(struct task_group *tg, int cpu)
+{
+   tg->cfs_rq[cpu]->prev_exec_clock = tg->cfs_rq[cpu]->exec_clock;
+}
 #else
-   data = *cpuusage;
+static inline u64 cfs_exec_clock(struct task_group *tg, int cpu)
+{
+   return 0;
+}
+
+static inline void cfs_exec_clock_reset(struct task_group *tg, int cpu)
+{
+}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline u64 rt_exec_clock(struct task_group *tg, int cpu)
+{
+   return tg->rt_rq[cpu]->exec_clock - tg->rt_rq[cpu]->prev_exec_clock;
+}
+
+static inline void rt_exec_clock_reset(struct task_group *tg, int cpu)
+{
+   tg->rt_rq[cpu]->prev_exec_clock = tg->rt_rq[cpu]->exec_clock;
+}
+#else
+static inline u64 rt_exec_clock(struct task_group *tg, int cpu)
+{
+   return 0;
+}
+
+static inline void rt_exec_clock_reset(struct task_group *tg, int cpu)
+{
+}
 #endif
 
+static u64 task_group_cpuusage_read(struct task_group *tg, int cpu)
+{
+   u64 ret = 0;
+
+   lock_rq_dword(cpu);
+   ret = cfs_exec_clock(tg, cpu) + rt_exec_clock(tg, cpu);
+   unlock_rq_dword(cpu);
+
+   return ret;
+}
+
+static void task_group_cpuusage_write(struct task_group *tg, int cpu, u64 val)
+{
+   lock_rq_dword(cpu);
+   cfs_exec_clock_reset(tg, cpu);
+   rt_exec_clock_reset(tg, cpu);
+   unlock_rq_dword(cpu);
+}
+#else
+static u64 task_group_cpuusage_read(struct task_group *tg, int cpu)
+{
+   u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
+   u64 data;
+
+   lock_rq_dword(cpu);
+   data = *cpuusage;
+   unlock_rq_dword(cpu);
+
return data;
 }
 
@@ -7222,17 +7294,11 @@ static void task_group_cpuusage_write(struct task_group 
*tg, int cpu, u64 val)
 {
u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
 
-#ifndef CONFIG_64BIT
-   /*
-* Take rq->lock to make 64-bit write safe on 32-bit platforms.
-*/
-   raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+   lock_rq_dword(cpu);
*cpuusage = val;
-   raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-   *cpuusage = val;
-#endif
+   unlock_rq_dword(cpu);
 }
+#endif
 
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpucg_cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b05dd84..0e5e795 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -710,8 +710,6 @@ static inline void set_task_rq(struct task_struct *p, 
unsigned int cpu)
 #endif
 }
 
-extern void task_group_charge(struct task_struct *tsk, u64 cputime);
-
 #else /* CONFIG_CGROUP_SCHED */
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
@@ -719,10 +717,14 @@ static inline struct task_group *task_group(struct 
task_struct *p)
 {
return NULL;
 }
-static inl

[PATCH v7 04/11] sched: adjust exec_clock to use it as cpu usage metric

2013-05-29 Thread Glauber Costa
exec_clock already provides per-group cpu usage metrics, and can be
reused by cpuacct in case cpu and cpuacct are comounted.

However, it is only provided by tasks in fair class. Doing the same for
rt is easy, and can be done in an already existing hierarchy loop. This
is an improvement over the independent hierarchy walk executed by
cpuacct.

Signed-off-by: Glauber Costa 
CC: Dave Jones 
CC: Ben Hutchings 
CC: Peter Zijlstra 
CC: Paul Turner 
CC: Lennart Poettering 
CC: Kay Sievers 
CC: Tejun Heo 
---
 kernel/sched/rt.c| 1 +
 kernel/sched/sched.h | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e9f8dcd..4a21045 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -907,6 +907,7 @@ static void update_curr_rt(struct rq *rq)
 
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
+   schedstat_add(rt_rq, exec_clock, delta_exec);
 
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
            raw_spin_lock(&rt_rq->rt_runtime_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 765c687..b05dd84 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -254,6 +254,7 @@ struct cfs_rq {
unsigned int nr_running, h_nr_running;
 
u64 exec_clock;
+   u64 prev_exec_clock;
u64 min_vruntime;
 #ifndef CONFIG_64BIT
u64 min_vruntime_copy;
@@ -356,6 +357,8 @@ struct rt_rq {
struct plist_head pushable_tasks;
 #endif
int rt_throttled;
+   u64 exec_clock;
+   u64 prev_exec_clock;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7 09/11] sched: record per-cgroup number of context switches

2013-05-29 Thread Glauber Costa
Context switches are, to this moment, a property of the runqueue. When
running containers, we would like to be able to present a separate
figure for each container (or cgroup, in this context).

The chosen way to accomplish this is to increment a per cfs_rq or
rt_rq, depending on the task, for each of the sched entities involved,
up to the parent. It is trivial to note that for the parent cgroup, we
always add 1 by doing this. Also, we are not introducing any hierarchy
walks in here. An already existent walk is reused.
There are, however, two main issues:

 1. the traditional context switch code only increments nr_switches when
 a different task is being inserted in the rq. Eventually, albeit not
 likely, we will pick the same task as before. Since for cfs and rt we
 only know which task will be next after the walk, we need to do the walk
 again, decrementing by 1. Since this is by far not likely, it seems a fair
 price to pay.

 2. Those figures do not include switches from and to the idle or stop
 task. Those need to be recorded separately, which will happen in a
 follow up patch.

Signed-off-by: Glauber Costa 
CC: Peter Zijlstra 
CC: Paul Turner 
---
 kernel/sched/fair.c  | 18 ++
 kernel/sched/rt.c| 15 +--
 kernel/sched/sched.h |  3 +++
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 414cf1c..b29e5dd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3642,6 +3642,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct 
*prev)
prev->sched_class->put_prev_task(rq, prev);
 
do {
+   if (likely(prev))
+   cfs_rq->nr_switches++;
se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
@@ -3651,6 +3653,22 @@ pick_next_task_fair(struct rq *rq, struct task_struct 
*prev)
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
 
+   /*
+* This condition is extremely unlikely, and most of the time will just
+* consist of this unlikely branch, which is extremely cheap. But we
+* still need to have it, because when we first loop through cfs_rq's,
+* we can't possibly know which task we will pick. The call to
+* set_next_entity above is not meant to mess up the tree in this case,
+* so this should give us the same chain, in the same order.
+*/
+   if (unlikely(p == prev)) {
+   se = &p->se;
+   for_each_sched_entity(se) {
+   cfs_rq = cfs_rq_of(se);
+   cfs_rq->nr_switches--;
+   }
+   }
+
return p;
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6c41fa1..894fc0b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1326,13 +1326,16 @@ static struct sched_rt_entity 
*pick_next_rt_entity(struct rq *rq,
return next;
 }
 
-static struct task_struct *_pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+_pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
struct sched_rt_entity *rt_se;
struct task_struct *p;
    struct rt_rq *rt_rq = &rq->rt;
 
do {
+   if (likely(prev))
+   rt_rq->rt_nr_switches++;
rt_se = pick_next_rt_entity(rq, rt_rq);
BUG_ON(!rt_se);
rt_rq = group_rt_rq(rt_se);
@@ -1341,6 +1344,14 @@ static struct task_struct *_pick_next_task_rt(struct rq 
*rq)
p = rt_task_of(rt_se);
p->se.exec_start = rq_clock_task(rq);
 
+   /* See fair.c for an explanation on this */
+   if (unlikely(p == prev)) {
+   for_each_sched_rt_entity(rt_se) {
+   rt_rq = rt_rq_of_se(rt_se);
+   rt_rq->rt_nr_switches--;
+   }
+   }
+
return p;
 }
 
@@ -1359,7 +1370,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev)
prev->sched_class->put_prev_task(rq, prev);
 
-   p = _pick_next_task_rt(rq);
+   p = _pick_next_task_rt(rq, prev);
 
/* The running task is never eligible for pushing */
if (p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a39eb9b..39d3284 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -273,6 +273,7 @@ struct cfs_rq {
unsigned int nr_spread_over;
 #endif
 
+   u64 nr_switches;
 #ifdef CONFIG_SMP
 /*
  * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
@@ -342,6 +343,8 @@ static inline int rt_bandwidth_enabled(void)
 struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
+   u64 rt_nr_switches;
+
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 10/11] sched: change nr_context_switches calculation.

2013-05-29 Thread Glauber Costa
This patch changes the calculation of nr_context_switches. The variable
"nr_switches" is now used to account for the number of transition to the
idle task, or stop task. It is removed from the schedule() path.

The total calculation can be made using the fact that the transitions to
fair and rt classes are recorded in the root_task_group. One can easily
derive the total figure by adding those quantities together.

Signed-off-by: Glauber Costa 
CC: Peter Zijlstra 
CC: Paul Turner 
---
 kernel/sched/core.c  | 17 +++--
 kernel/sched/idle_task.c |  3 +++
 kernel/sched/stop_task.c |  2 ++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b1441f..892c828 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2031,13 +2031,27 @@ unsigned long nr_running(void)
return sum;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#define cfs_nr_switches(tg, cpu) (tg)->cfs_rq[cpu]->nr_switches
+#else
+#define cfs_nr_switches(tg, cpu) cpu_rq(cpu)->cfs.nr_switches
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+#define rt_nr_switches(tg, cpu) (tg)->rt_rq[cpu]->rt_nr_switches
+#else
+#define rt_nr_switches(tg, cpu) cpu_rq(cpu)->rt.rt_nr_switches
+#endif
+
 unsigned long long nr_context_switches(void)
 {
int i;
unsigned long long sum = 0;
 
-   for_each_possible_cpu(i)
+   for_each_possible_cpu(i) {
+   sum += cfs_nr_switches(&root_task_group, i);
+   sum += rt_nr_switches(&root_task_group, i);
sum += cpu_rq(i)->nr_switches;
+   }
 
return sum;
 }
@@ -2417,7 +2431,6 @@ need_resched:
rq->skip_clock_update = 0;
 
if (likely(prev != next)) {
-   rq->nr_switches++;
rq->curr = next;
++*switch_count;
 
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 4f0a332..923286c 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -39,6 +39,9 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev)
if (prev)
prev->sched_class->put_prev_task(rq, prev);
 
+   if (prev != rq->idle)
+   rq->nr_switches++;
+
schedstat_inc(rq, sched_goidle);
 #ifdef CONFIG_SMP
/* Trigger the post schedule to do an idle_enter for CFS */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 6adf6a9..48c572a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -32,6 +32,8 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
stop->se.exec_start = rq_clock_task(rq);
if (prev)
prev->sched_class->put_prev_task(rq, prev);
+   if (prev != rq->stop)
+   rq->nr_switches++;
return stop;
}
 
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   3   4   5   6   7   8   9   10   >