On Fri, Dec 16, 2011 at 12:43:52PM -0600, Nate Custer wrote:
> 
> On Dec 15, 2011, at 1:47 PM, Vivek Goyal wrote:
> > Ok, I continued developing the patch, which tries to allocate per cpu
> > stats from worker thread context. Here is the patch.
> > 
> > Can the reporter please try out the patch and see if it helps? I am not
> > sure whether the deadlock was because of the mutex issue or not, but it
> > should help get rid of the lockdep warning.
> > 
> > This patch is on top of the for-3.3/core branch of Jens's linux-block tree.
> > If it does not apply to your kernel version, do let me know the version
> > you are testing with and I will generate another version of the patch.
> > 
> > If the testing results are good, I will break the patch down into small
> > pieces and post it as a series separately.
> > 
> > Thanks
> > Vivek
> 
> Running on a fedora-16 box with the patch applied to the linux-block tree I
> still had deadlocks. In fact they seemed to happen much faster and with
> lighter workloads.
> 
> I was able to get netconsole setup and a full stacktrace is posted here:
> 
> http://pastebin.com/9Rq68exU

Thanks for testing it, Nate. I did some debugging and found out that the patch
is doing a double free on the per cpu pointer, hence the crash you are running
into. I could reproduce this problem on my box; it is just a matter of doing
rmdir on the blkio cgroup.

I had understood the cmpxchg() semantics wrong. That is fixed now and I see no
crashes on directory removal. Can you please give this version a try?
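
(Background for anyone following along: cmpxchg(ptr, old, new) returns the
value that was in *ptr before the operation, not a success/failure code. The
exchange only took place if that return value equals 'old', so the freshly
allocated buffer must be freed on any other return value. That is the rule v2
now follows. A bare-bones sketch, with made-up names 'slot' and 'newbuf' that
are not from the patch:

	/* Illustrative sketch only -- 'slot' and 'newbuf' are hypothetical */
	unsigned long old;

	old = cmpxchg(slot, 0UL, (unsigned long)newbuf);
	if (old != 0) {
		/* another context already installed a buffer; ours is unused */
		free_percpu(newbuf);
	}
)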

Thanks
Vivek

Allocate per cpu data from a worker thread context to avoid the possibility of
a deadlock under memory pressure.

The only side effect of delayed allocation is that we will lose the blkio
cgroup stats for that group for a small duration.
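
In other words, the group itself is allocated with GFP_ATOMIC under the queue
lock, the blocking alloc_percpu() is pushed to a workqueue item which then
publishes the pointer with cmpxchg(), and every stats update/read path has to
tolerate a NULL blkg->stats_cpu until that work has run. A condensed sketch of
the worker side (simplified from the actual hunks below, not a drop-in
replacement):

	/* Condensed, illustrative version of the deferred allocation worker */
	static void tg_stat_alloc_fn(struct work_struct *work)
	{
		struct throtl_grp *tg = container_of(work, struct throtl_grp,
						     stat_alloc_work);
		void *stats = blkio_alloc_blkg_stats_pcpu();	/* may sleep */

		if (!stats) {
			/* allocation failed; queue the work again and retry */
			throtl_check_schedule_pcpu_stat_alloc(tg);
		} else if (cmpxchg((unsigned long *)&tg->blkg.stats_cpu, 0UL,
				   (unsigned long)stats) != 0) {
			/* lost the race to another allocator; drop our copy */
			free_percpu(stats);
		}
		throtl_put_tg(tg);	/* ref taken when the work was queued */
	}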

This patch is generated on top of the "for-3.3/core" branch of the linux-block
tree.

-v2:
 Fixed the issue of double free on percpu pointer.
---
 block/blk-cgroup.c   |   31 ++++++++++++-
 block/blk-cgroup.h   |    9 +++
 block/blk-throttle.c |  116 ++++++++++++++++++++++++++++++++-----------------
 block/cfq-iosched.c  |  119 ++++++++++++++++++++++++++++++---------------------
 4 files changed, 180 insertions(+), 95 deletions(-)

Index: block/blk-throttle.c
===================================================================
--- block/blk-throttle.c.orig   2011-12-17 01:49:34.000000000 -0500
+++ block/blk-throttle.c        2011-12-17 02:21:50.000000000 -0500
@@ -81,6 +81,7 @@ struct throtl_grp {
        int limits_changed;
 
        struct rcu_head rcu_head;
+       struct work_struct stat_alloc_work;
 };
 
 struct throtl_data
@@ -159,6 +160,13 @@ static void throtl_free_tg(struct rcu_he
        struct throtl_grp *tg;
 
        tg = container_of(head, struct throtl_grp, rcu_head);
+
+       /*
+        * Will the delayed allocation be visible here for sure? I am relying
+        * on the fact that after the blkg.stats_cpu assignment, we drop the
+        * reference to the group using atomic_dec(), which should imply a
+        * barrier.
+        */
        free_percpu(tg->blkg.stats_cpu);
        kfree(tg);
 }
@@ -181,6 +189,48 @@ static void throtl_put_tg(struct throtl_
        call_rcu(&tg->rcu_head, throtl_free_tg);
 }
 
+static inline void throtl_check_schedule_pcpu_stat_alloc(struct throtl_grp *tg) {
+       if (tg->blkg.stats_cpu != NULL)
+               return;
+       /*
+        * Schedule the group per cpu stat allocation through worker
+        * thread
+        */
+       throtl_ref_get_tg(tg);
+       if (!schedule_work(&tg->stat_alloc_work)) {
+               /* Work is already scheduled by somebody */
+               throtl_put_tg(tg);
+       }
+}
+
+/* No locks taken. A reference to throtl_grp taken before invocation */
+static void tg_stat_alloc_fn(struct work_struct *work)
+{
+       struct throtl_grp *tg = container_of(work, struct throtl_grp,
+                                               stat_alloc_work);
+       void *stat_ptr = NULL;
+       unsigned long ret;
+
+       stat_ptr = blkio_alloc_blkg_stats_pcpu();
+
+       if (stat_ptr == NULL) {
+               /* If memory allocation failed, try again */
+               throtl_check_schedule_pcpu_stat_alloc(tg);
+               throtl_put_tg(tg);
+               return;
+       }
+
+       ret = blkio_cmpxchg_blkg_stats(&tg->blkg, 0,
+                                       (unsigned long)stat_ptr);
+
+       if (ret != 0) {
+               /* Somebody else got to it first */
+               free_percpu(stat_ptr);
+       }
+
+       throtl_put_tg(tg);
+}
+
 static void throtl_init_group(struct throtl_grp *tg)
 {
        INIT_HLIST_NODE(&tg->tg_node);
@@ -188,6 +238,7 @@ static void throtl_init_group(struct thr
        bio_list_init(&tg->bio_lists[0]);
        bio_list_init(&tg->bio_lists[1]);
        tg->limits_changed = false;
+       INIT_WORK(&tg->stat_alloc_work, tg_stat_alloc_fn);
 
        /* Practically unlimited BW */
        tg->bps[0] = tg->bps[1] = -1;
@@ -263,8 +314,25 @@ static void throtl_init_add_tg_lists(str
        throtl_add_group_to_td_list(td, tg);
 }
 
-/* Should be called without queue lock and outside of rcu period */
-static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+/* Allocates per cpu stats asynchronously with the help of a worker thread */
+static struct throtl_grp *throtl_alloc_tg_async(struct throtl_data *td)
+{
+       struct throtl_grp *tg = NULL;
+
+       tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+       if (!tg)
+               return NULL;
+
+       throtl_init_group(tg);
+       throtl_check_schedule_pcpu_stat_alloc(tg);
+       return tg;
+}
+
+/*
+ * Should be called without queue lock and outside of rcu period as per
+ * cpu alloc will block
+ */
+static struct throtl_grp *throtl_alloc_tg_sync(struct throtl_data *td)
 {
        struct throtl_grp *tg = NULL;
        int ret;
@@ -273,7 +341,7 @@ static struct throtl_grp *throtl_alloc_t
        if (!tg)
                return NULL;
 
-       ret = blkio_alloc_blkg_stats(&tg->blkg);
+       ret = blkio_alloc_set_blkg_stats(&tg->blkg);
 
        if (ret) {
                kfree(tg);
@@ -305,7 +373,7 @@ throtl_grp *throtl_find_tg(struct throtl
 
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
-       struct throtl_grp *tg = NULL, *__tg = NULL;
+       struct throtl_grp *tg = NULL;
        struct blkio_cgroup *blkcg;
        struct request_queue *q = td->queue;
 
@@ -321,46 +389,12 @@ static struct throtl_grp * throtl_get_tg
                return tg;
        }
 
-       /*
-        * Need to allocate a group. Allocation of group also needs allocation
-        * of per cpu stats which in-turn takes a mutex() and can block. Hence
-        * we need to drop rcu lock and queue_lock before we call alloc.
-        */
-       rcu_read_unlock();
-       spin_unlock_irq(q->queue_lock);
-
-       tg = throtl_alloc_tg(td);
-
-       /* Group allocated and queue is still alive. take the lock */
-       spin_lock_irq(q->queue_lock);
-
-       /* Make sure @q is still alive */
-       if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-               kfree(tg);
-               return NULL;
-       }
-
-       /*
-        * Initialize the new group. After sleeping, read the blkcg again.
-        */
-       rcu_read_lock();
-       blkcg = task_blkio_cgroup(current);
-
-       /*
-        * If some other thread already allocated the group while we were
-        * not holding queue lock, free up the group
-        */
-       __tg = throtl_find_tg(td, blkcg);
-
-       if (__tg) {
-               kfree(tg);
-               rcu_read_unlock();
-               return __tg;
-       }
+       tg = throtl_alloc_tg_async(td);
 
        /* Group allocation failed. Account the IO to root group */
        if (!tg) {
                tg = td->root_tg;
+               rcu_read_unlock();
                return tg;
        }
 
@@ -1254,7 +1288,7 @@ int blk_throtl_init(struct request_queue
 
        /* alloc and Init root group. */
        td->queue = q;
-       tg = throtl_alloc_tg(td);
+       tg = throtl_alloc_tg_sync(td);
 
        if (!tg) {
                kfree(td);
Index: block/blk-cgroup.c
===================================================================
--- block/blk-cgroup.c.orig     2011-12-17 01:49:34.000000000 -0500
+++ block/blk-cgroup.c  2011-12-17 01:49:35.000000000 -0500
@@ -400,6 +400,9 @@ void blkiocg_update_dispatch_stats(struc
        struct blkio_group_stats_cpu *stats_cpu;
        unsigned long flags;
 
+       if (blkg->stats_cpu == NULL)
+               return;
+
        /*
         * Disabling interrupts to provide mutual exclusion between two
         * writes on same cpu. It probably is not needed for 64bit. Not
@@ -446,6 +449,9 @@ void blkiocg_update_io_merged_stats(stru
        struct blkio_group_stats_cpu *stats_cpu;
        unsigned long flags;
 
+       if (blkg->stats_cpu == NULL)
+               return;
+
        /*
         * Disabling interrupts to provide mutual exclusion between two
         * writes on same cpu. It probably is not needed for 64bit. Not
@@ -465,9 +471,11 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_merg
 
 /*
  * This function allocates the per cpu stats for blkio_group. Should be called
- * from sleepable context as alloc_per_cpu() requires that.
+ * from sleepable context as alloc_per_cpu() requires that. percpu alloc does
+ * not take any flags and does GFP_KERNEL allocations. Don't use it from the
+ * IO submission path, which usually requires GFP_NOIO.
  */
-int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+int blkio_alloc_set_blkg_stats(struct blkio_group *blkg)
 {
        /* Allocate memory for per cpu stats */
        blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
@@ -475,7 +483,14 @@ int blkio_alloc_blkg_stats(struct blkio_
                return -ENOMEM;
        return 0;
 }
-EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+EXPORT_SYMBOL_GPL(blkio_alloc_set_blkg_stats);
+
+void *blkio_alloc_blkg_stats_pcpu(void)
+{
+       /* Allocate memory for per cpu stats */
+       return alloc_percpu(struct blkio_group_stats_cpu);
+}
+EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats_pcpu);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                struct blkio_group *blkg, void *key, dev_t dev,
@@ -551,6 +566,12 @@ static void blkio_reset_stats_cpu(struct
 {
        struct blkio_group_stats_cpu *stats_cpu;
        int i, j, k;
+
+       /* blkg->stats_cpu can have delayed allocation */
+
+       if (!blkg->stats_cpu)
+               return;
+
        /*
         * Note: On 64 bit arch this should not be an issue. This has the
         * possibility of returning some inconsistent value on 32bit arch
@@ -673,6 +694,10 @@ static uint64_t blkio_read_stat_cpu(stru
        struct blkio_group_stats_cpu *stats_cpu;
        u64 val = 0, tval;
 
+       /* blkg->stats_cpu might not have been allocated yet */
+       if (blkg->stats_cpu == NULL)
+               return 0;
+
        for_each_possible_cpu(cpu) {
                unsigned int start;
                stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
Index: block/blk-cgroup.h
===================================================================
--- block/blk-cgroup.h.orig     2011-12-17 01:49:34.000000000 -0500
+++ block/blk-cgroup.h  2011-12-17 01:49:35.000000000 -0500
@@ -310,7 +310,12 @@ extern struct blkio_cgroup *task_blkio_c
 extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
        struct blkio_group *blkg, void *key, dev_t dev,
        enum blkio_policy_id plid);
-extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
+extern int blkio_alloc_set_blkg_stats(struct blkio_group *blkg);
+extern void *blkio_alloc_blkg_stats_pcpu(void);
+
+#define blkio_cmpxchg_blkg_stats(blkg, oldval, newval) \
+       cmpxchg((unsigned long *)&(blkg)->stats_cpu, oldval, newval)
+
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
                                                void *key);
@@ -338,7 +343,7 @@ static inline void blkiocg_add_blkio_gro
                struct blkio_group *blkg, void *key, dev_t dev,
                enum blkio_policy_id plid) {}
 
-static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
+static inline int blkio_alloc_set_blkg_stats(struct blkio_group *blkg) { return 0; }
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
Index: block/cfq-iosched.c
===================================================================
--- block/cfq-iosched.c.orig    2011-12-17 01:49:34.000000000 -0500
+++ block/cfq-iosched.c 2011-12-17 02:21:50.000000000 -0500
@@ -209,7 +209,12 @@ struct cfq_group {
        struct blkio_group blkg;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
        struct hlist_node cfqd_node;
-       int ref;
+       /*
+        * blkg per cpu stat allocation code will need to put the reference
+        * without holding the queue lock. Hence keep it atomic.
+        */
+       atomic_t ref;
+       struct work_struct stat_alloc_work;
 #endif
        /* number of requests that are on the dispatch list or inside driver */
        int dispatched;
@@ -1012,6 +1017,9 @@ static void cfq_group_served(struct cfq_
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void cfq_put_cfqg(struct cfq_group *cfqg);
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg);
+
 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 {
        if (blkg)
@@ -1054,6 +1062,52 @@ static void cfq_init_add_cfqg_lists(stru
        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 }
 
+static inline void cfq_check_schedule_pcpu_stat_alloc(struct cfq_group *cfqg) {
+       if (cfqg->blkg.stats_cpu != NULL)
+               return;
+
+       /*
+        * Schedule the group per cpu stat allocation through worker
+        * thread
+        */
+       cfq_ref_get_cfqg(cfqg);
+       if (!schedule_work(&cfqg->stat_alloc_work)) {
+               /* Work is already scheduled by somebody */
+               cfq_put_cfqg(cfqg);
+       }
+}
+
+/* No locks taken. A reference to cfq_group taken before invocation */
+static void cfqg_stat_alloc_fn(struct work_struct *work)
+{
+       struct cfq_group *cfqg = container_of(work, struct cfq_group,
+                                               stat_alloc_work);
+       void *stat_ptr = NULL;
+       unsigned long ret;
+
+       stat_ptr = blkio_alloc_blkg_stats_pcpu();
+
+       if (stat_ptr == NULL) {
+               /*
+                * Memory allocation failed. Try again. Should an upper limit
+                * be put on the number of retries?
+                */
+               cfq_check_schedule_pcpu_stat_alloc(cfqg);
+               cfq_put_cfqg(cfqg);
+               return;
+       }
+
+       ret = blkio_cmpxchg_blkg_stats(&cfqg->blkg, 0,
+                                       (unsigned long)stat_ptr);
+
+       if (ret != 0) {
+               /* Somebody else got to it first */
+               free_percpu(stat_ptr);
+       }
+
+       cfq_put_cfqg(cfqg);
+}
+
 /*
  * Should be called from sleepable context. No request queue lock as per
  * cpu stats are allocated dynamically and alloc_percpu needs to be called
@@ -1062,7 +1116,7 @@ static void cfq_init_add_cfqg_lists(stru
 static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
 {
        struct cfq_group *cfqg = NULL;
-       int i, j, ret;
+       int i, j;
        struct cfq_rb_root *st;
 
        cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
@@ -1072,6 +1126,7 @@ static struct cfq_group * cfq_alloc_cfqg
        for_each_cfqg_st(cfqg, i, j, st)
                *st = CFQ_RB_ROOT;
        RB_CLEAR_NODE(&cfqg->rb_node);
+       INIT_WORK(&cfqg->stat_alloc_work, cfqg_stat_alloc_fn);
 
        cfqg->ttime.last_end_request = jiffies;
 
@@ -1081,14 +1136,9 @@ static struct cfq_group * cfq_alloc_cfqg
         * elevator which will be dropped by either elevator exit
         * or cgroup deletion path depending on who is exiting first.
         */
-       cfqg->ref = 1;
-
-       ret = blkio_alloc_blkg_stats(&cfqg->blkg);
-       if (ret) {
-               kfree(cfqg);
-               return NULL;
-       }
+       atomic_set(&cfqg->ref, 1);
 
+       cfq_check_schedule_pcpu_stat_alloc(cfqg);
        return cfqg;
 }
 
@@ -1124,8 +1174,7 @@ cfq_find_cfqg(struct cfq_data *cfqd, str
 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
        struct blkio_cgroup *blkcg;
-       struct cfq_group *cfqg = NULL, *__cfqg = NULL;
-       struct request_queue *q = cfqd->queue;
+       struct cfq_group *cfqg = NULL;
 
        rcu_read_lock();
        blkcg = task_blkio_cgroup(current);
@@ -1135,41 +1184,13 @@ static struct cfq_group *cfq_get_cfqg(st
                return cfqg;
        }
 
-       /*
-        * Need to allocate a group. Allocation of group also needs allocation
-        * of per cpu stats which in-turn takes a mutex() and can block. Hence
-        * we need to drop rcu lock and queue_lock before we call alloc.
-        *
-        * Not taking any queue reference here and assuming that queue is
-        * around by the time we return. CFQ queue allocation code does
-        * the same. It might be racy though.
-        */
-
-       rcu_read_unlock();
-       spin_unlock_irq(q->queue_lock);
-
        cfqg = cfq_alloc_cfqg(cfqd);
-
-       spin_lock_irq(q->queue_lock);
-
-       rcu_read_lock();
-       blkcg = task_blkio_cgroup(current);
-
-       /*
-        * If some other thread already allocated the group while we were
-        * not holding queue lock, free up the group
-        */
-       __cfqg = cfq_find_cfqg(cfqd, blkcg);
-
-       if (__cfqg) {
-               kfree(cfqg);
+       if (!cfqg) {
+               cfqg = &cfqd->root_group;
                rcu_read_unlock();
-               return __cfqg;
+               return cfqg;
        }
 
-       if (!cfqg)
-               cfqg = &cfqd->root_group;
-
        cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
        rcu_read_unlock();
        return cfqg;
@@ -1177,7 +1198,7 @@ static struct cfq_group *cfq_get_cfqg(st
 
 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
 {
-       cfqg->ref++;
+       atomic_inc(&cfqg->ref);
        return cfqg;
 }
 
@@ -1189,7 +1210,7 @@ static void cfq_link_cfqq_cfqg(struct cf
 
        cfqq->cfqg = cfqg;
        /* cfqq reference on cfqg */
-       cfqq->cfqg->ref++;
+       atomic_inc(&cfqq->cfqg->ref);
 }
 
 static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1197,9 +1218,8 @@ static void cfq_put_cfqg(struct cfq_grou
        struct cfq_rb_root *st;
        int i, j;
 
-       BUG_ON(cfqg->ref <= 0);
-       cfqg->ref--;
-       if (cfqg->ref)
+       BUG_ON(atomic_read(&cfqg->ref) <= 0);
+       if (!atomic_dec_and_test(&cfqg->ref))
                return;
        for_each_cfqg_st(cfqg, i, j, st)
                BUG_ON(!RB_EMPTY_ROOT(&st->rb));
@@ -4025,6 +4045,7 @@ static void *cfq_init_queue(struct reque
        cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+       INIT_WORK(&cfqg->stat_alloc_work, cfqg_stat_alloc_fn);
        /*
         * Set root group reference to 2. One reference will be dropped when
         * all groups on cfqd->cfqg_list are being deleted during queue exit.
@@ -4032,9 +4053,9 @@ static void *cfq_init_queue(struct reque
         * group as it is statically allocated and gets destroyed when
         * throtl_data goes away.
         */
-       cfqg->ref = 2;
+       atomic_set(&cfqg->ref, 2);
 
-       if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+       if (blkio_alloc_set_blkg_stats(&cfqg->blkg)) {
                kfree(cfqg);
                kfree(cfqd);
                return NULL;
--