This patch enables hierarchical fair queuing in the common layer. It is
controlled by the config option CONFIG_GROUP_IOSCHED.
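
With this patch, each cgroup gets its own io_group per request queue, and
io_queues are attached to their group's scheduler data instead of directly
to the root. Entity activation, deactivation and service accounting walk
the parent chain up to the root (for_each_entity), so disk time is first
divided among groups and then among the queues within each group. A new
"io" cgroup subsystem exposes two control files: weight (0 to WEIGHT_MAX,
i.e. 1000; new groups default to 500) and ioprio_class (RT, BE or IDLE,
default BE). Task queues map their ioprio onto the same scale via
weight = ((IOPRIO_BE_NR - ioprio) * WEIGHT_MAX) / IOPRIO_BE_NR, so with
IOPRIO_BE_NR = 8 an ioprio-0 queue gets weight 1000 and an ioprio-4 queue
gets 500.

A rough usage sketch (the mount point and group name are only examples,
and the control file names assume the usual <subsys>.<file> cgroup
naming):

	# mount -t cgroup -o io none /cgroup
	# mkdir /cgroup/grp1
	# echo 1000 > /cgroup/grp1/io.weight
	# echo 2 > /cgroup/grp1/io.ioprio_class   # 2 == IOPRIO_CLASS_BE
	# echo $$ > /cgroup/grp1/tasks

Tasks in grp1 should then receive disk time in proportion to the group's
weight relative to its sibling groups.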

Signed-off-by: Nauman Rafique <[email protected]>
Signed-off-by: Fabio Checconi <[email protected]>
Signed-off-by: Paolo Valente <[email protected]>
Signed-off-by: Aristeu Rozanski <[email protected]>
Signed-off-by: Vivek Goyal <[email protected]>
---
 block/blk-ioc.c               |    3 +
 block/elevator-fq.c           | 1037 +++++++++++++++++++++++++++++++++++++----
 block/elevator-fq.h           |  149 ++++++-
 block/elevator.c              |    6 +
 include/linux/blkdev.h        |    7 +-
 include/linux/cgroup_subsys.h |    7 +
 include/linux/iocontext.h     |    5 +
 init/Kconfig                  |    8 +
 8 files changed, 1127 insertions(+), 95 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065..8f0f6cf 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -95,6 +95,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
                spin_lock_init(&ret->lock);
                ret->ioprio_changed = 0;
                ret->ioprio = 0;
+#ifdef CONFIG_GROUP_IOSCHED
+               ret->cgroup_changed = 0;
+#endif
                ret->last_waited = jiffies; /* doesn't matter... */
                ret->nr_batch_requests = 0; /* because this is 0 */
                ret->aic = NULL;
diff --git a/block/elevator-fq.c b/block/elevator-fq.c
index 9f1fbb9..cdaa46f 100644
--- a/block/elevator-fq.c
+++ b/block/elevator-fq.c
@@ -24,6 +24,10 @@ static int elv_rate_sampling_window = HZ / 10;
 
 #define ELV_SLICE_SCALE                (5)
 #define ELV_HW_QUEUE_MIN       (5)
+
+#define IO_DEFAULT_GRP_WEIGHT  500
+#define IO_DEFAULT_GRP_CLASS   IOPRIO_CLASS_BE
+
 #define IO_SERVICE_TREE_INIT   ((struct io_service_tree)               \
                                { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
 
@@ -31,6 +35,7 @@ static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
                                        struct io_queue *ioq, int probe);
 struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
                                                 int extract);
+void elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr);
 
 static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync,
                                        unsigned short prio)
@@ -49,6 +54,73 @@ elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq)
 }
 
 /* Mainly the BFQ scheduling code Follows */
+#ifdef CONFIG_GROUP_IOSCHED
+#define for_each_entity(entity)        \
+       for (; entity != NULL; entity = entity->parent)
+
+#define for_each_entity_safe(entity, parent) \
+       for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+
+struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
+                                                int extract);
+void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq,
+                                       int requeue);
+void elv_activate_ioq(struct io_queue *ioq, int add_front);
+void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
+                                       int requeue);
+
+static int bfq_update_next_active(struct io_sched_data *sd)
+{
+       struct io_group *iog;
+       struct io_entity *entity, *next_active;
+
+       if (sd->active_entity != NULL)
+               /* will update/requeue at the end of service */
+               return 0;
+
+       /*
+        * NOTE: this can be improved in many ways, such as returning
+        * 1 (and thus propagating upwards the update) only when the
+        * budget changes, or caching the bfqq that will be scheduled
+        * next from this subtree.  By now we worry more about
+        * correctness than about performance...
+        */
+       next_active = bfq_lookup_next_entity(sd, 0);
+       sd->next_active = next_active;
+
+       if (next_active != NULL) {
+               iog = container_of(sd, struct io_group, sched_data);
+               entity = iog->my_entity;
+               if (entity != NULL)
+                       entity->budget = next_active->budget;
+       }
+
+       return 1;
+}
+
+static inline void bfq_check_next_active(struct io_sched_data *sd,
+                                        struct io_entity *entity)
+{
+       BUG_ON(sd->next_active != entity);
+}
+#else /* GROUP_IOSCHED */
+#define for_each_entity(entity)        \
+       for (; entity != NULL; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+       for (parent = NULL; entity != NULL; entity = parent)
+
+static inline int bfq_update_next_active(struct io_sched_data *sd)
+{
+       return 0;
+}
+
+static inline void bfq_check_next_active(struct io_sched_data *sd,
+                                        struct io_entity *entity)
+{
+}
+#endif
 
 /*
  * Shift for timestamp calculations.  This actually limits the maximum
@@ -295,16 +367,6 @@ static void bfq_active_insert(struct io_service_tree *st,
        bfq_update_active_tree(node);
 }
 
-/**
- * bfq_ioprio_to_weight - calc a weight from an ioprio.
- * @ioprio: the ioprio value to convert.
- */
-static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
-{
-       WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
-       return IOPRIO_BE_NR - ioprio;
-}
-
 void bfq_get_entity(struct io_entity *entity)
 {
        struct io_queue *ioq = io_entity_to_ioq(entity);
@@ -313,13 +375,6 @@ void bfq_get_entity(struct io_entity *entity)
                elv_get_ioq(ioq);
 }
 
-void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
-{
-       entity->ioprio = entity->new_ioprio;
-       entity->ioprio_class = entity->new_ioprio_class;
-       entity->sched_data = &iog->sched_data;
-}
-
 /**
  * bfq_find_deepest - find the deepest node that an extraction can modify.
  * @node: the node being removed.
@@ -462,8 +517,10 @@ __bfq_entity_update_prio(struct io_service_tree *old_st,
        struct io_queue *ioq = io_entity_to_ioq(entity);
 
        if (entity->ioprio_changed) {
+               old_st->wsum -= entity->weight;
                entity->ioprio = entity->new_ioprio;
                entity->ioprio_class = entity->new_ioprio_class;
+               entity->weight = entity->new_weight;
                entity->ioprio_changed = 0;
 
                /*
@@ -475,9 +532,6 @@ __bfq_entity_update_prio(struct io_service_tree *old_st,
                        entity->budget = elv_prio_to_slice(efqd, ioq);
                }
 
-               old_st->wsum -= entity->weight;
-               entity->weight = bfq_ioprio_to_weight(entity->ioprio);
-
                /*
                 * NOTE: here we may be changing the weight too early,
                 * this will cause unfairness.  The correct approach
@@ -559,11 +613,8 @@ static void __bfq_activate_entity(struct io_entity *entity, int add_front)
        if (add_front) {
                struct io_entity *next_entity;
 
-               /*
-                * Determine the entity which will be dispatched next
-                * Use sd->next_active once hierarchical patch is applied
-                */
-               next_entity = bfq_lookup_next_entity(sd, 0);
+               /* Determine the entity which will be dispatched next */
+               next_entity = sd->next_active;
 
                if (next_entity && next_entity != entity) {
                        struct io_service_tree *new_st;
@@ -590,12 +641,27 @@ static void __bfq_activate_entity(struct io_entity *entity, int add_front)
 }
 
 /**
- * bfq_activate_entity - activate an entity.
+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
  * @entity: the entity to activate.
+ * Activate @entity and all the entities on the path from it to the root.
  */
 void bfq_activate_entity(struct io_entity *entity, int add_front)
 {
-       __bfq_activate_entity(entity, add_front);
+       struct io_sched_data *sd;
+
+       for_each_entity(entity) {
+               __bfq_activate_entity(entity, add_front);
+
+               add_front = 0;
+               sd = entity->sched_data;
+               if (!bfq_update_next_active(sd))
+                       /*
+                        * No need to propagate the activation to the
+                        * upper entities, as they will be updated when
+                        * the active entity is rescheduled.
+                        */
+                       break;
+       }
 }
 
 /**
@@ -631,12 +697,16 @@ int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
        else if (entity->tree != NULL)
                BUG();
 
+       if (was_active || sd->next_active == entity)
+               ret = bfq_update_next_active(sd);
+
        if (!requeue || !bfq_gt(entity->finish, st->vtime))
                bfq_forget_entity(st, entity);
        else
                bfq_idle_insert(st, entity);
 
        BUG_ON(sd->active_entity == entity);
+       BUG_ON(sd->next_active == entity);
 
        return ret;
 }
@@ -648,7 +718,46 @@ int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
  */
 void bfq_deactivate_entity(struct io_entity *entity, int requeue)
 {
-       __bfq_deactivate_entity(entity, requeue);
+       struct io_sched_data *sd;
+       struct io_entity *parent;
+
+       for_each_entity_safe(entity, parent) {
+               sd = entity->sched_data;
+
+               if (!__bfq_deactivate_entity(entity, requeue))
+                       /*
+                        * The parent entity is still backlogged, and
+                        * we don't need to update it as it is still
+                        * under service.
+                        */
+                       break;
+
+               if (sd->next_active != NULL)
+                       /*
+                        * The parent entity is still backlogged and
+                        * the budgets on the path towards the root
+                        * need to be updated.
+                        */
+                       goto update;
+
+               /*
+        * If we reach here, the parent is no longer backlogged and
+                * we want to propagate the dequeue upwards.
+                */
+               requeue = 1;
+       }
+
+       return;
+
+update:
+       entity = parent;
+       for_each_entity(entity) {
+               __bfq_activate_entity(entity, 0);
+
+               sd = entity->sched_data;
+               if (!bfq_update_next_active(sd))
+                       break;
+       }
 }
 
 /**
@@ -765,8 +874,10 @@ struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
                entity = __bfq_lookup_next_entity(st);
                if (entity != NULL) {
                        if (extract) {
+                               bfq_check_next_active(sd, entity);
                                bfq_active_extract(st, entity);
                                sd->active_entity = entity;
+                               sd->next_active = NULL;
                        }
                        break;
                }
@@ -779,13 +890,768 @@ void entity_served(struct io_entity *entity, bfq_service_t served)
 {
        struct io_service_tree *st;
 
-       st = io_entity_service_tree(entity);
-       entity->service += served;
-       BUG_ON(st->wsum == 0);
-       st->vtime += bfq_delta(served, st->wsum);
-       bfq_forget_idle(st);
+       for_each_entity(entity) {
+               st = io_entity_service_tree(entity);
+               entity->service += served;
+               BUG_ON(st->wsum == 0);
+               st->vtime += bfq_delta(served, st->wsum);
+               bfq_forget_idle(st);
+       }
+}
+
+/*
+ * Release all the io group references to its async queues.
+ */
+void io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
+{
+       int i, j;
+
+       for (i = 0; i < 2; i++)
+               for (j = 0; j < IOPRIO_BE_NR; j++)
+                       elv_release_ioq(e, &iog->async_queue[i][j]);
+
+       /* Free up async idle queue */
+       elv_release_ioq(e, &iog->async_idle_queue);
+}
+
+
+/* Mainly hierarchical grouping code */
+#ifdef CONFIG_GROUP_IOSCHED
+
+struct io_cgroup io_root_cgroup = {
+       .weight = IO_DEFAULT_GRP_WEIGHT,
+       .ioprio_class = IO_DEFAULT_GRP_CLASS,
+};
+
+void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
+{
+       entity->ioprio = entity->new_ioprio;
+       entity->weight = entity->new_weight;
+       entity->ioprio_class = entity->new_ioprio_class;
+       entity->parent = iog->my_entity;
+       entity->sched_data = &iog->sched_data;
+}
+
+struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup)
+{
+       return container_of(cgroup_subsys_state(cgroup, io_subsys_id),
+                           struct io_cgroup, css);
+}
+
+/*
+ * Search for the io_group associated with the given key in the hash table
+ * (for now only a list) of iocg.  Must be called under rcu_read_lock().
+ */
+struct io_group *io_cgroup_lookup_group(struct io_cgroup *iocg, void *key)
+{
+       struct io_group *iog;
+       struct hlist_node *n;
+       void *__key;
+
+       hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) {
+               __key = rcu_dereference(iog->key);
+               if (__key == key)
+                       return iog;
+       }
+
+       return NULL;
+}
+
+struct io_group *io_lookup_io_group_current(struct request_queue *q)
+{
+       struct io_group *iog;
+       struct io_cgroup *iocg;
+       struct cgroup *cgroup;
+       struct elv_fq_data *efqd = &q->elevator->efqd;
+
+       cgroup = task_cgroup(current, io_subsys_id);
+       iocg = cgroup_to_io_cgroup(cgroup);
+       iog = io_cgroup_lookup_group(iocg, efqd);
+       return iog;
+}
+EXPORT_SYMBOL(io_lookup_io_group_current);
+
+void io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog)
+{
+       struct io_entity *entity = &iog->entity;
+
+       entity->weight = entity->new_weight = iocg->weight;
+       entity->ioprio_class = entity->new_ioprio_class = iocg->ioprio_class;
+       entity->ioprio_changed = 1;
+       entity->my_sched_data = &iog->sched_data;
+}
+
+void io_group_set_parent(struct io_group *iog, struct io_group *parent)
+{
+       struct io_entity *entity;
+
+       BUG_ON(parent == NULL);
+       BUG_ON(iog == NULL);
+
+       entity = &iog->entity;
+       entity->parent = parent->my_entity;
+       entity->sched_data = &parent->sched_data;
+}
+
+/**
+ * io_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+void io_flush_idle_tree(struct io_service_tree *st)
+{
+       struct io_entity *entity = st->first_idle;
+
+       for (; entity != NULL; entity = st->first_idle)
+               __bfq_deactivate_entity(entity, 0);
+}
+
+#define SHOW_FUNCTION(__VAR)                                           \
+static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup,             \
+                                      struct cftype *cftype)           \
+{                                                                      \
+       struct io_cgroup *iocg;                                 \
+       u64 ret;                                                        \
+                                                                       \
+       if (!cgroup_lock_live_group(cgroup))                            \
+               return -ENODEV;                                         \
+                                                                       \
+       iocg = cgroup_to_io_cgroup(cgroup);                             \
+       spin_lock_irq(&iocg->lock);                                     \
+       ret = iocg->__VAR;                                              \
+       spin_unlock_irq(&iocg->lock);                                   \
+                                                                       \
+       cgroup_unlock();                                                \
+                                                                       \
+       return ret;                                                     \
+}
+
+SHOW_FUNCTION(weight);
+SHOW_FUNCTION(ioprio_class);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__VAR, __MIN, __MAX)                            \
+static int io_cgroup_##__VAR##_write(struct cgroup *cgroup,            \
+                                       struct cftype *cftype,          \
+                                       u64 val)                        \
+{                                                                      \
+       struct io_cgroup *iocg;                                 \
+       struct io_group *iog;                                           \
+       struct hlist_node *n;                                           \
+                                                                       \
+       if (val < (__MIN) || val > (__MAX))                             \
+               return -EINVAL;                                         \
+                                                                       \
+       if (!cgroup_lock_live_group(cgroup))                            \
+               return -ENODEV;                                         \
+                                                                       \
+       iocg = cgroup_to_io_cgroup(cgroup);                             \
+                                                                       \
+       spin_lock_irq(&iocg->lock);                                     \
+       iocg->__VAR = (unsigned long)val;                               \
+       hlist_for_each_entry(iog, n, &iocg->group_data, group_node) {   \
+               iog->entity.new_##__VAR = (unsigned long)val;           \
+               smp_wmb();                                              \
+               iog->entity.ioprio_changed = 1;                         \
+       }                                                               \
+       spin_unlock_irq(&iocg->lock);                                   \
+                                                                       \
+       cgroup_unlock();                                                \
+                                                                       \
+       return 0;                                                       \
+}
+
+STORE_FUNCTION(weight, 0, WEIGHT_MAX);
+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
+#undef STORE_FUNCTION
+
+/**
+ * io_group_chain_alloc - allocate a chain of groups.
+ * @q: the request queue; @key: the key (efqd) to associate the groups with.
+ * @cgroup: the leaf cgroup this chain starts from.
+ *
+ * Allocate a chain of groups starting from the one belonging to
+ * @cgroup up to the root cgroup.  Stop if a cgroup on the chain
+ * to the root already has an allocated group for @key.
+ */
+struct io_group *io_group_chain_alloc(struct request_queue *q, void *key,
+                                       struct cgroup *cgroup)
+{
+       struct io_cgroup *iocg;
+       struct io_group *iog, *leaf = NULL, *prev = NULL;
+       gfp_t flags = GFP_ATOMIC |  __GFP_ZERO;
+
+       for (; cgroup != NULL; cgroup = cgroup->parent) {
+               iocg = cgroup_to_io_cgroup(cgroup);
+
+               iog = io_cgroup_lookup_group(iocg, key);
+               if (iog != NULL) {
+                       /*
+                        * All the cgroups in the path from here to the
+                        * root must already have an io_group for this key,
+                        * so we don't need any more allocations.
+                        */
+                       break;
+               }
+
+               iog = kzalloc_node(sizeof(*iog), flags, q->node);
+               if (!iog)
+                       goto cleanup;
+
+               io_group_init_entity(iocg, iog);
+               iog->my_entity = &iog->entity;
+
+               if (leaf == NULL) {
+                       leaf = iog;
+                       prev = leaf;
+               } else {
+                       io_group_set_parent(prev, iog);
+                       /*
+                        * Build a list of allocated nodes using the key
+                        * field, which is still unused and will be
+                        * initialized only after the node is connected.
+                        */
+                       prev->key = iog;
+                       prev = iog;
+               }
+       }
+
+       return leaf;
+
+cleanup:
+       while (leaf != NULL) {
+               prev = leaf;
+               leaf = leaf->key;
+               kfree(prev);
+       }
+
+       return NULL;
+}
+
+/**
+ * io_group_chain_link - link an allocated group chain to a cgroup hierarchy.
+ * @q: the request queue; @key: the key (efqd) the chain is associated with.
+ * @cgroup: the leaf cgroup to start from.
+ * @leaf: the leaf group (to be associated to @cgroup).
+ *
+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
+ * hierarchy that already has a group associated with @key, all the nodes
+ * in the path to the root cgroup have one too.
+ *
+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
+ * per device) while the io_cgroup lock protects the list of groups
+ * belonging to the same cgroup.
+ */
+void io_group_chain_link(struct request_queue *q, void *key,
+                               struct cgroup *cgroup,
+                               struct io_group *leaf,
+                               struct elv_fq_data *efqd)
+{
+       struct io_cgroup *iocg;
+       struct io_group *iog, *next, *prev = NULL;
+       unsigned long flags;
+
+       assert_spin_locked(q->queue_lock);
+
+       for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
+               iocg = cgroup_to_io_cgroup(cgroup);
+               next = leaf->key;
+
+               iog = io_cgroup_lookup_group(iocg, key);
+               BUG_ON(iog != NULL);
+
+               spin_lock_irqsave(&iocg->lock, flags);
+
+               rcu_assign_pointer(leaf->key, key);
+               hlist_add_head_rcu(&leaf->group_node, &iocg->group_data);
+               hlist_add_head(&leaf->elv_data_node, &efqd->group_list);
+
+               spin_unlock_irqrestore(&iocg->lock, flags);
+
+               prev = leaf;
+               leaf = next;
+       }
+
+       BUG_ON(cgroup == NULL && leaf != NULL);
+
+       if (cgroup != NULL && prev != NULL) {
+               iocg = cgroup_to_io_cgroup(cgroup);
+               iog = io_cgroup_lookup_group(iocg, key);
+               io_group_set_parent(prev, iog);
+       }
+}
+
+/**
+ * io_find_alloc_group - return the group associated with @efqd in @cgroup.
+ * @q: the request queue; @efqd: the elevator fair queuing data, used as key.
+ * @cgroup: cgroup being searched for.
+ * @create: if set to 1, create the io group if it has not been created yet.
+ *
+ * Return a group associated with @efqd in @cgroup, allocating one if
+ * necessary.  When a group is returned all the cgroups in the path
+ * to the root have a group associated with @efqd.
+ *
+ * If the allocation fails, return the root group: this breaks guarantees
+ * but is a safe fallback.  If this loss becomes a problem it can be
+ * mitigated using the equivalent weight (given by the product of the
+ * weights of the groups in the path from the group to the root) in the
+ * root scheduler.
+ *
+ * We allocate all the missing nodes in the path from the leaf cgroup
+ * to the root and we connect the nodes only after all the allocations
+ * have been successful.
+ */
+struct io_group *io_find_alloc_group(struct request_queue *q,
+                       struct cgroup *cgroup, struct elv_fq_data *efqd,
+                       int create)
+{
+       struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
+       struct io_group *iog = NULL;
+       /* Note: Use efqd as key */
+       void *key = efqd;
+
+       iog = io_cgroup_lookup_group(iocg, key);
+       if (iog != NULL || !create)
+               return iog;
+
+       iog = io_group_chain_alloc(q, key, cgroup);
+       if (iog != NULL)
+               io_group_chain_link(q, key, cgroup, iog, efqd);
+
+       return iog;
+}
+
+/*
+ * Search for the io group current task belongs to. If create=1, then also
+ * create the io group if it is not already there.
+ */
+struct io_group *io_get_io_group(struct request_queue *q, int create)
+{
+       struct cgroup *cgroup;
+       struct io_group *iog;
+       struct elv_fq_data *efqd = &q->elevator->efqd;
+
+       rcu_read_lock();
+       cgroup = task_cgroup(current, io_subsys_id);
+       iog = io_find_alloc_group(q, cgroup, efqd, create);
+       if (!iog) {
+               if (create)
+                       iog = efqd->root_group;
+               else
+                       /*
+                        * bio merge functions doing lookups don't want to
+                        * map the bio to the root group by default
+                        */
+                       iog = NULL;
+       }
+       rcu_read_unlock();
+       return iog;
+}
+
+void io_free_root_group(struct elevator_queue *e)
+{
+       struct io_cgroup *iocg = &io_root_cgroup;
+       struct elv_fq_data *efqd = &e->efqd;
+       struct io_group *iog = efqd->root_group;
+
+       BUG_ON(!iog);
+       spin_lock_irq(&iocg->lock);
+       hlist_del_rcu(&iog->group_node);
+       spin_unlock_irq(&iocg->lock);
+       io_put_io_group_queues(e, iog);
+       kfree(iog);
+}
+
+struct io_group *io_alloc_root_group(struct request_queue *q,
+                                       struct elevator_queue *e, void *key)
+{
+       struct io_group *iog;
+       struct io_cgroup *iocg;
+       int i;
+
+       iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+       if (iog == NULL)
+               return NULL;
+
+       iog->entity.parent = NULL;
+       for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+               iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
+
+       iocg = &io_root_cgroup;
+       spin_lock_irq(&iocg->lock);
+       rcu_assign_pointer(iog->key, key);
+       hlist_add_head_rcu(&iog->group_node, &iocg->group_data);
+       spin_unlock_irq(&iocg->lock);
+
+       return iog;
+}
+
+struct cftype bfqio_files[] = {
+       {
+               .name = "weight",
+               .read_u64 = io_cgroup_weight_read,
+               .write_u64 = io_cgroup_weight_write,
+       },
+       {
+               .name = "ioprio_class",
+               .read_u64 = io_cgroup_ioprio_class_read,
+               .write_u64 = io_cgroup_ioprio_class_write,
+       },
+};
+
+int iocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+       return cgroup_add_files(cgroup, subsys, bfqio_files,
+                               ARRAY_SIZE(bfqio_files));
+}
+
+struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys,
+                                               struct cgroup *cgroup)
+{
+       struct io_cgroup *iocg;
+
+       if (cgroup->parent != NULL) {
+               iocg = kzalloc(sizeof(*iocg), GFP_KERNEL);
+               if (iocg == NULL)
+                       return ERR_PTR(-ENOMEM);
+       } else
+               iocg = &io_root_cgroup;
+
+       spin_lock_init(&iocg->lock);
+       INIT_HLIST_HEAD(&iocg->group_data);
+       iocg->weight = IO_DEFAULT_GRP_WEIGHT;
+       iocg->ioprio_class = IO_DEFAULT_GRP_CLASS;
+
+       return &iocg->css;
+}
+
+/*
+ * We cannot support shared io contexts, as we have no means to support
+ * two tasks with the same ioc in two different groups without major rework
+ * of the main cic/ioq data structures.  For now we allow a task to change
+ * its cgroup only if it's the only owner of its ioc; the drawback of this
+ * behavior is that a group containing a task that forked using CLONE_IO
+ * will not be destroyed until the tasks sharing the ioc die.
+ */
+int iocg_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+                           struct task_struct *tsk)
+{
+       struct io_context *ioc;
+       int ret = 0;
+
+       /* task_lock() is needed to avoid races with exit_io_context() */
+       task_lock(tsk);
+       ioc = tsk->io_context;
+       if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
+               /*
+                * ioc == NULL means that the task is either too young or
+                * exiting: if it has still no ioc the ioc can't be shared,
+                * if the task is exiting the attach will fail anyway, no
+                * matter what we return here.
+                */
+               ret = -EINVAL;
+       task_unlock(tsk);
+
+       return ret;
+}
+
+void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+                        struct cgroup *prev, struct task_struct *tsk)
+{
+       struct io_context *ioc;
+
+       task_lock(tsk);
+       ioc = tsk->io_context;
+       if (ioc != NULL)
+               ioc->cgroup_changed = 1;
+       task_unlock(tsk);
+}
+
+/*
+ * Move the queue to the root group if it is active. This is needed when
+ * a cgroup is being deleted and all the IO is not done yet. This is not
+ * a very good scheme, as a user might get an unfair share. This needs to be
+ * fixed.
+ */
+void io_ioq_move(struct elevator_queue *e, struct io_queue *ioq,
+                               struct io_group *iog)
+{
+       int busy, resume;
+       struct io_entity *entity = &ioq->entity;
+       struct elv_fq_data *efqd = &e->efqd;
+       struct io_service_tree *st = io_entity_service_tree(entity);
+
+       busy = elv_ioq_busy(ioq);
+       resume = !!ioq->nr_queued;
+
+       BUG_ON(resume && !entity->on_st);
+       BUG_ON(busy && !resume && entity->on_st && ioq != efqd->active_queue);
+
+       /*
+        * We could be moving a queue which is on the idle tree of the
+        * previous group. What to do? This queue probably has no requests
+        * anyway; just forget the entity and remove it from the idle tree.
+        *
+        * This needs cleanup. Hackish.
+        */
+       if (entity->tree == &st->idle) {
+               BUG_ON(atomic_read(&ioq->ref) < 2);
+               bfq_put_idle_entity(st, entity);
+       }
+
+       if (busy) {
+               BUG_ON(atomic_read(&ioq->ref) < 2);
+
+               if (!resume)
+                       elv_del_ioq_busy(e, ioq, 0);
+               else
+                       elv_deactivate_ioq(efqd, ioq, 0);
+       }
+
+       /*
+        * Here we use a reference to iog.  We don't need a refcounter
+        * as the cgroup reference will not be dropped, so that its
+        * destroy() callback will not be invoked.
+        */
+       entity->parent = iog->my_entity;
+       entity->sched_data = &iog->sched_data;
+
+       if (busy && resume)
+               elv_activate_ioq(ioq, 0);
+}
+EXPORT_SYMBOL(io_ioq_move);
+
+static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog)
+{
+       struct elevator_queue *eq;
+       struct io_entity *entity = iog->my_entity;
+       struct io_service_tree *st;
+       int i;
+
+       eq = container_of(efqd, struct elevator_queue, efqd);
+       hlist_del(&iog->elv_data_node);
+       __bfq_deactivate_entity(entity, 0);
+       io_put_io_group_queues(eq, iog);
+
+       for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+               st = iog->sched_data.service_tree + i;
+
+               /*
+                * The idle tree may still contain io_queues belonging
+                * to exited tasks because they never migrated to a different
+                * cgroup from the one being destroyed now.  No one else
+                * can access them, so it's safe to act without any lock.
+                */
+               io_flush_idle_tree(st);
+
+               BUG_ON(!RB_EMPTY_ROOT(&st->active));
+               BUG_ON(!RB_EMPTY_ROOT(&st->idle));
+       }
+
+       BUG_ON(iog->sched_data.next_active != NULL);
+       BUG_ON(iog->sched_data.active_entity != NULL);
+       BUG_ON(entity->tree != NULL);
+}
+
+/**
+ * io_destroy_group - destroy @iog.
+ * @iocg: the io_cgroup containing @iog.
+ * @iog: the group being destroyed.
+ *
+ * Destroy @iog, making sure that it is not referenced from its parent.
+ */
+static void io_destroy_group(struct io_cgroup *iocg, struct io_group *iog)
+{
+       struct elv_fq_data *efqd = NULL;
+       unsigned long uninitialized_var(flags);
+
+       /* Remove io group from cgroup list */
+       hlist_del(&iog->group_node);
+
+       /*
+        * io groups are linked in two lists. One list is maintained
+        * in the elevator (efqd->group_list) and the other is maintained
+        * per cgroup structure (iocg->group_data).
+        *
+        * While a cgroup is being deleted, the elevator also might be
+        * exiting, and both might try to clean up the same io group,
+        * so we need to be a little careful.
+        *
+        * The following code first accesses efqd under RCU to make sure
+        * iog->key is pointing to a valid efqd and then takes the
+        * associated queue lock. After getting the queue lock it
+        * again checks whether the elevator exit path has already got
+        * hold of the io group (iog->key == NULL). If yes, it does not
+        * try to free up async queues again or flush the idle tree.
+        */
+
+       rcu_read_lock();
+       efqd = rcu_dereference(iog->key);
+       if (efqd != NULL) {
+               spin_lock_irqsave(efqd->queue->queue_lock, flags);
+               if (iog->key == efqd)
+                       __io_destroy_group(efqd, iog);
+               spin_unlock_irqrestore(efqd->queue->queue_lock, flags);
+       }
+       rcu_read_unlock();
+
+       /*
+        * No need to defer the kfree() to the end of the RCU grace
+        * period: we are called from the destroy() callback of our
+        * cgroup, so we can be sure that no one is a) still using
+        * this cgroup or b) doing lookups in it.
+        */
+       kfree(iog);
+}
+
+void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+       struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
+       struct hlist_node *n, *tmp;
+       struct io_group *iog;
+
+       /*
+        * Since we are destroying the cgroup, there are no more tasks
+        * referencing it, and all the RCU grace periods that may have
+        * referenced it are ended (as the destruction of the parent
+        * cgroup is RCU-safe); iocg->group_data will not be accessed by
+        * anything else and we don't need any synchronization.
+        */
+       hlist_for_each_entry_safe(iog, n, tmp, &iocg->group_data, group_node)
+               io_destroy_group(iocg, iog);
+
+       BUG_ON(!hlist_empty(&iocg->group_data));
+
+       kfree(iocg);
+}
+
+void io_disconnect_groups(struct elevator_queue *e)
+{
+       struct hlist_node *pos, *n;
+       struct io_group *iog;
+       struct elv_fq_data *efqd = &e->efqd;
+
+       hlist_for_each_entry_safe(iog, pos, n, &efqd->group_list,
+                                       elv_data_node) {
+               hlist_del(&iog->elv_data_node);
+
+               __bfq_deactivate_entity(iog->my_entity, 0);
+
+               /*
+                * Don't remove from the group hash, just set an
+                * invalid key.  No lookups can race with the
+                * assignment as bfqd is being destroyed; this
+                * implies also that new elements cannot be added
+                * to the list.
+                */
+               rcu_assign_pointer(iog->key, NULL);
+               io_put_io_group_queues(e, iog);
+       }
+}
+
+struct cgroup_subsys io_subsys = {
+       .name = "io",
+       .create = iocg_create,
+       .can_attach = iocg_can_attach,
+       .attach = iocg_attach,
+       .destroy = iocg_destroy,
+       .populate = iocg_populate,
+       .subsys_id = io_subsys_id,
+};
+
+/*
+ * If the bio-submitting task and rq don't belong to the same io_group,
+ * they can't be merged.
+ */
+int io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+       struct request_queue *q = rq->q;
+       struct io_queue *ioq = rq->ioq;
+       struct io_group *iog, *__iog;
+
+       if (!elv_iosched_fair_queuing_enabled(q->elevator))
+               return 1;
+
+       /* Determine the io group of the bio submitting task */
+       iog = io_get_io_group(q, 0);
+       if (!iog) {
+               /* Maybe the task belongs to a different cgroup for which
+                * the io group has not been set up yet. */
+               return 0;
+       }
+
+       /* Determine the io group of the ioq that rq belongs to */
+       __iog = ioq_to_io_group(ioq);
+
+       return (iog == __iog);
+}
+
+/* Find/create the io group the request belongs to and store that info in rq */
+void elv_fq_set_request_io_group(struct request_queue *q,
+                                               struct request *rq)
+{
+       struct io_group *iog;
+       unsigned long flags;
+
+       /* Make sure the io group hierarchy has been set up and also set the
+        * io group to which rq belongs. Later we should make use of the
+        * bio cgroup patches to determine the io group */
+       spin_lock_irqsave(q->queue_lock, flags);
+       iog = io_get_io_group(q, 1);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+       BUG_ON(!iog);
+
+       /* Store iog in rq. TODO: take care of referencing */
+       rq->iog = iog;
 }
 
+#else /* GROUP_IOSCHED */
+void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
+{
+       entity->ioprio = entity->new_ioprio;
+       entity->weight = entity->new_weight;
+       entity->ioprio_class = entity->new_ioprio_class;
+       entity->sched_data = &iog->sched_data;
+}
+
+struct io_group *io_alloc_root_group(struct request_queue *q,
+                                       struct elevator_queue *e, void *key)
+{
+       struct io_group *iog;
+       int i;
+
+       iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+       if (iog == NULL)
+               return NULL;
+
+       for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+               iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
+
+       return iog;
+}
+
+struct io_group *io_lookup_io_group_current(struct request_queue *q)
+{
+       struct elv_fq_data *efqd = &q->elevator->efqd;
+
+       return efqd->root_group;
+}
+EXPORT_SYMBOL(io_lookup_io_group_current);
+
+void io_free_root_group(struct elevator_queue *e)
+{
+       struct io_group *iog = e->efqd.root_group;
+       io_put_io_group_queues(e, iog);
+       kfree(iog);
+}
+
+struct io_group *io_get_io_group(struct request_queue *q, int create)
+{
+       return q->elevator->efqd.root_group;
+}
+
+#endif /* CONFIG_GROUP_IOSCHED*/
+
 /* Elevator fair queuing function */
 struct io_queue *rq_ioq(struct request *rq)
 {
@@ -1177,9 +2043,11 @@ EXPORT_SYMBOL(elv_put_ioq);
 
 void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
 {
+       struct io_group *root_group = e->efqd.root_group;
        struct io_queue *ioq = *ioq_ptr;
 
        if (ioq != NULL) {
+               io_ioq_move(e, ioq, root_group);
                /* Drop the reference taken by the io group */
                elv_put_ioq(ioq);
                *ioq_ptr = NULL;
@@ -1233,14 +2101,27 @@ struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract)
                return NULL;
 
        sd = &efqd->root_group->sched_data;
-       if (extract)
-               entity = bfq_lookup_next_entity(sd, 1);
-       else
-               entity = bfq_lookup_next_entity(sd, 0);
+       for (; sd != NULL; sd = entity->my_sched_data) {
+               if (extract)
+                       entity = bfq_lookup_next_entity(sd, 1);
+               else
+                       entity = bfq_lookup_next_entity(sd, 0);
+
+               /*
+                * entity can be NULL despite the fact that there are busy
+                * queues, if all the busy queues are under a group which is
+                * currently under service.
+                * So if we are just looking for the next ioq while something
+                * is being served, a NULL entity is not an error.
+                */
+               BUG_ON(!entity && extract);
+
+               if (extract)
+                       entity->service = 0;
 
-       BUG_ON(!entity);
-       if (extract)
-               entity->service = 0;
+               if (!entity)
+                       return NULL;
+       }
        ioq = io_entity_to_ioq(entity);
 
        return ioq;
@@ -1256,8 +2137,12 @@ static void __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
        struct request_queue *q = efqd->queue;
 
        if (ioq) {
-               elv_log_ioq(efqd, ioq, "set_active, busy=%d",
-                                                       efqd->busy_queues);
+               struct io_group *iog = ioq_to_io_group(ioq);
+               elv_log_ioq(efqd, ioq, "set_active, busy=%d ioprio=%d"
+                               " weight=%ld group_weight=%ld",
+                               efqd->busy_queues,
+                               ioq->entity.ioprio, ioq->entity.weight,
+                               iog_weight(iog));
                ioq->slice_end = 0;
 
                elv_clear_ioq_wait_request(ioq);
@@ -1492,6 +2377,7 @@ int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
 {
        struct io_queue *ioq;
        struct elevator_queue *eq = q->elevator;
+       struct io_group *iog = NULL, *new_iog = NULL;
 
        ioq = elv_active_ioq(eq);
 
@@ -1509,14 +2395,26 @@ int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
 
        /*
         * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+        *
+        * TODO: In a hierarchical setup, one needs to traverse up the
+        * hierarchy till both the queues are children of the same parent
+        * to decide whether to do the preemption or not. Something like
+        * what CFS has done for the CPU scheduler. Will do it a little later.
         */
        if (elv_ioq_class_rt(new_ioq) && !elv_ioq_class_rt(ioq))
                return 1;
 
+       iog = ioq_to_io_group(ioq);
+       new_iog = ioq_to_io_group(new_ioq);
+
        /*
-        * Check with io scheduler if it has additional criterion based on
-        * which it wants to preempt existing queue.
+        * If both the queues belong to the same group, check with the io
+        * scheduler if it has an additional criterion based on which it
+        * wants to preempt the existing queue.
         */
+       if (iog != new_iog)
+               return 0;
+
        if (eq->ops->elevator_should_preempt_fn)
                return eq->ops->elevator_should_preempt_fn(q, new_ioq, rq);
 
@@ -1938,14 +2836,6 @@ void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
                elv_schedule_dispatch(q);
 }
 
-struct io_group *io_lookup_io_group_current(struct request_queue *q)
-{
-       struct elv_fq_data *efqd = &q->elevator->efqd;
-
-       return efqd->root_group;
-}
-EXPORT_SYMBOL(io_lookup_io_group_current);
-
 void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
                                        int ioprio)
 {
@@ -1996,44 +2886,6 @@ void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
 }
 EXPORT_SYMBOL(io_group_set_async_queue);
 
-/*
- * Release all the io group references to its async queues.
- */
-void io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
-{
-       int i, j;
-
-       for (i = 0; i < 2; i++)
-               for (j = 0; j < IOPRIO_BE_NR; j++)
-                       elv_release_ioq(e, &iog->async_queue[i][j]);
-
-       /* Free up async idle queue */
-       elv_release_ioq(e, &iog->async_idle_queue);
-}
-
-struct io_group *io_alloc_root_group(struct request_queue *q,
-                                       struct elevator_queue *e, void *key)
-{
-       struct io_group *iog;
-       int i;
-
-       iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
-       if (iog == NULL)
-               return NULL;
-
-       for (i = 0; i < IO_IOPRIO_CLASSES; i++)
-               iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
-
-       return iog;
-}
-
-void io_free_root_group(struct elevator_queue *e)
-{
-       struct io_group *iog = e->efqd.root_group;
-       io_put_io_group_queues(e, iog);
-       kfree(iog);
-}
-
 static void elv_slab_kill(void)
 {
        /*
@@ -2079,6 +2931,7 @@ int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
        INIT_WORK(&efqd->unplug_work, elv_kick_queue);
 
        INIT_LIST_HEAD(&efqd->idle_list);
+       INIT_HLIST_HEAD(&efqd->group_list);
 
        efqd->elv_slice[0] = elv_slice_async;
        efqd->elv_slice[1] = elv_slice_sync;
@@ -2108,10 +2961,14 @@ void elv_exit_fq_data(struct elevator_queue *e)
        spin_lock_irq(q->queue_lock);
        /* This should drop all the idle tree references of ioq */
        elv_free_idle_ioq_list(e);
+       /* This should drop all the io group references to async queues */
+       io_disconnect_groups(e);
        spin_unlock_irq(q->queue_lock);
 
        elv_shutdown_timer_wq(e);
 
+       /* Wait for iog->key accessors to exit their grace periods. */
+       synchronize_rcu();
        BUG_ON(timer_pending(&efqd->idle_slice_timer));
        io_free_root_group(e);
 }
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
index ce2d671..8c60cf7 100644
--- a/block/elevator-fq.h
+++ b/block/elevator-fq.h
@@ -9,11 +9,13 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/cgroup.h>
 
 #ifndef _BFQ_SCHED_H
 #define _BFQ_SCHED_H
 
 #define IO_IOPRIO_CLASSES      3
+#define WEIGHT_MAX             1000
 
 typedef u64 bfq_timestamp_t;
 typedef unsigned long bfq_weight_t;
@@ -69,6 +71,7 @@ struct io_service_tree {
  */
 struct io_sched_data {
        struct io_entity *active_entity;
+       struct io_entity *next_active;
        struct io_service_tree service_tree[IO_IOPRIO_CLASSES];
 };
 
@@ -84,13 +87,12 @@ struct io_sched_data {
  *             this entity; used for O(log N) lookups into active trees.
  * @service: service received during the last round of service.
  * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
- * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
  * @parent: parent entity, for hierarchical scheduling.
  * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
  *                 associated scheduler queue, %NULL on leaf nodes.
  * @sched_data: the scheduler queue this entity belongs to.
- * @ioprio: the ioprio in use.
- * @new_ioprio: when an ioprio change is requested, the new ioprio value
+ * @weight: the weight in use.
+ * @new_weight: when a weight change is requested, the new weight value
  * @ioprio_class: the ioprio_class in use.
  * @new_ioprio_class: when an ioprio_class change is requested, the new
  *                    ioprio_class value.
@@ -132,13 +134,13 @@ struct io_entity {
        bfq_timestamp_t min_start;
 
        bfq_service_t service, budget;
-       bfq_weight_t weight;
 
        struct io_entity *parent;
 
        struct io_sched_data *my_sched_data;
        struct io_sched_data *sched_data;
 
+       bfq_weight_t weight, new_weight;
        unsigned short ioprio, new_ioprio;
        unsigned short ioprio_class, new_ioprio_class;
 
@@ -180,6 +182,75 @@ struct io_queue {
        void *sched_queue;
 };
 
+#ifdef CONFIG_GROUP_IOSCHED
+/**
+ * struct io_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both io_queues and io_groups).
+ * @group_node: node to be inserted into the io_cgroup->group_data
+ *              list of the containing cgroup's io_cgroup.
+ * @elv_data_node: node to be inserted into the @efqd->group_list list
+ *             of the groups active on the same device; used for cleanup.
+ * @key: the elv_fq_data for the device this group acts upon.
+ * @async_queue: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_queue: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/migration.
+ *
+ * Each (device, cgroup) pair has its own io_group, i.e., for each cgroup
+ * there is a set of io_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @group_node is protected by the io_cgroup lock, and is accessed
+ *      via RCU from its readers.
+ *    o @key is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the queue lock.
+ */
+struct io_group {
+       struct io_entity entity;
+       struct hlist_node elv_data_node;
+       struct hlist_node group_node;
+       struct io_sched_data sched_data;
+
+       struct io_entity *my_entity;
+
+       /*
+        * A cgroup has multiple io_groups, one for each request queue.
+        * To find the io group belonging to a particular queue, the
+        * elv_fq_data pointer is stored as a key.
+        */
+       void *key;
+
+       /* async_queue and idle_queue are used only for cfq */
+       struct io_queue *async_queue[2][IOPRIO_BE_NR];
+       struct io_queue *async_idle_queue;
+};
+
+/**
+ * struct io_cgroup - per cgroup io data structure.
+ * @css: subsystem state for the io subsystem in the containing cgroup.
+ * @weight: cgroup weight.
+ * @ioprio_class: cgroup ioprio_class.
+ * @lock: spinlock that protects @weight, @ioprio_class and @group_data.
+ * @group_data: list containing the io_groups belonging to this cgroup.
+ *
+ * @group_data is accessed using RCU, with @lock protecting the updates,
+ * @weight and @ioprio_class are protected by @lock.
+ */
+struct io_cgroup {
+       struct cgroup_subsys_state css;
+
+       unsigned long weight, ioprio_class;
+
+       spinlock_t lock;
+       struct hlist_head group_data;
+};
+#else
 struct io_group {
        struct io_sched_data sched_data;
 
@@ -187,10 +258,14 @@ struct io_group {
        struct io_queue *async_queue[2][IOPRIO_BE_NR];
        struct io_queue *async_idle_queue;
 };
+#endif
 
 struct elv_fq_data {
        struct io_group *root_group;
 
+       /* List of io groups hanging on this elevator */
+       struct hlist_head group_list;
+
        /* List of io queues on idle tree. */
        struct list_head idle_list;
 
@@ -375,9 +450,20 @@ static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq,
        ioq->entity.ioprio_changed = 1;
 }
 
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+static inline bfq_weight_t bfq_ioprio_to_weight(int ioprio)
+{
+       WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+       return ((IOPRIO_BE_NR - ioprio) * WEIGHT_MAX)/IOPRIO_BE_NR;
+}
+
 static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio)
 {
        ioq->entity.new_ioprio = ioprio;
+       ioq->entity.new_weight = bfq_ioprio_to_weight(ioprio);
        ioq->entity.ioprio_changed = 1;
 }
 
@@ -394,6 +480,50 @@ static inline struct io_group *ioq_to_io_group(struct io_queue *ioq)
                                                sched_data);
 }
 
+#ifdef CONFIG_GROUP_IOSCHED
+extern int io_group_allow_merge(struct request *rq, struct bio *bio);
+extern void io_ioq_move(struct elevator_queue *e, struct io_queue *ioq,
+                                       struct io_group *iog);
+extern void elv_fq_set_request_io_group(struct request_queue *q,
+                                               struct request *rq);
+static inline bfq_weight_t iog_weight(struct io_group *iog)
+{
+       return iog->entity.weight;
+}
+
+#else /* !GROUP_IOSCHED */
+/*
+ * No ioq movement is needed in case of a flat setup. The root io group
+ * gets cleaned up upon elevator exit, and before that it has been made
+ * sure that both the active and idle trees are empty.
+ */
+static inline void io_ioq_move(struct elevator_queue *e, struct io_queue *ioq,
+                                       struct io_group *iog)
+{
+}
+
+static inline int io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+       return 1;
+}
+/*
+ * Currently the root group is not part of the elevator group list and is
+ * freed separately. Hence, in a non-hierarchical setup there is nothing to do.
+ */
+static inline void io_disconnect_groups(struct elevator_queue *e) {}
+static inline void elv_fq_set_request_io_group(struct request_queue *q,
+                                               struct request *rq)
+{
+}
+
+static inline bfq_weight_t iog_weight(struct io_group *iog)
+{
+       /* Only the root group is present and its weight is immaterial. */
+       return 0;
+}
+
+#endif /* GROUP_IOSCHED */
+
 /* Functions used by blksysfs.c */
 extern ssize_t elv_slice_idle_show(struct request_queue *q, char *name);
 extern ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
@@ -495,5 +625,16 @@ static inline void *elv_fq_select_ioq(struct request_queue *q, int force)
 {
        return NULL;
 }
+
+static inline void elv_fq_set_request_io_group(struct request_queue *q,
+                                               struct request *rq)
+{
+}
+
+static inline int io_group_allow_merge(struct request *rq, struct bio *bio)
+
+{
+       return 1;
+}
 #endif /* CONFIG_ELV_FAIR_QUEUING */
 #endif /* _BFQ_SCHED_H */
diff --git a/block/elevator.c b/block/elevator.c
index c2f07f5..4321169 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -105,6 +105,10 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
        if (bio_integrity(bio) != blk_integrity_rq(rq))
                return 0;
 
+       /* If rq and bio belong to different groups, don't allow merging */
+       if (!io_group_allow_merge(rq, bio))
+               return 0;
+
        if (!elv_iosched_allow_merge(rq, bio))
                return 0;
 
@@ -913,6 +917,8 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
        struct elevator_queue *e = q->elevator;
 
+       elv_fq_set_request_io_group(q, rq);
+
        if (e->ops->elevator_set_req_fn)
                return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4634949..9c209a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -249,7 +249,12 @@ struct request {
 #ifdef CONFIG_ELV_FAIR_QUEUING
        /* io queue request belongs to */
        struct io_queue *ioq;
-#endif
+
+#ifdef CONFIG_GROUP_IOSCHED
+       /* io group request belongs to */
+       struct io_group *iog;
+#endif /* GROUP_IOSCHED */
+#endif /* ELV_FAIR_QUEUING */
 };
 
 static inline unsigned short req_get_ioprio(struct request *req)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..68ea6bd 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,3 +60,10 @@ SUBSYS(net_cls)
 #endif
 
 /* */
+
+#ifdef CONFIG_GROUP_IOSCHED
+SUBSYS(io)
+#endif
+
+/* */
+
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987b..51664bb 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -73,6 +73,11 @@ struct io_context {
        unsigned short ioprio;
        unsigned short ioprio_changed;
 
+#ifdef CONFIG_GROUP_IOSCHED
+       /* If the task changes its cgroup, the elevator processes it asynchronously */
+       unsigned short cgroup_changed;
+#endif
+
        /*
         * For request batching
         */
diff --git a/init/Kconfig b/init/Kconfig
index 7be4d38..ab76477 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -606,6 +606,14 @@ config CGROUP_MEM_RES_CTLR_SWAP
          Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
          size is 4096bytes, 512k per 1Gbytes of swap.
 
+config GROUP_IOSCHED
+       bool "Group IO Scheduler"
+       depends on CGROUPS && ELV_FAIR_QUEUING
+       default n
+       ---help---
+         This feature lets the IO scheduler recognize task groups and
+         control disk bandwidth allocation to each of them.
+
 endif # CGROUPS
 
 config MM_OWNER
-- 
1.6.0.1
