I have a few comments interspersed.

First, can we get a better subject/title for this?  It looks like it
is really the Proportional share I/O scheduler, not the "clone-cfq"
patch.

Also, a description at the head of the patch helps for sending things
on to Andrew.  It looks like the Kconfig documentation might be a good
start for that header.  That header is what gets used as the change log
when the patch is finally committed, so it should be as descriptive as
possible.

On Wed, 11 May 2005 17:25:37 PDT, Chandra Seetharaman wrote:
> 
>  drivers/block/Kconfig.iosched |    9 
>  drivers/block/Makefile        |    1 
>  drivers/block/ps-iosched.c    | 1852 
> ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/blkdev.h        |   16 
>  4 files changed, 1878 insertions(+)
> 
> Signed-off-by:  Shailabh Nagar <[EMAIL PROTECTED]>
> Signed-off-by:  Chandra Seetharaman <[EMAIL PROTECTED]> 
> 
> Index: linux-2.6.12-rc3/drivers/block/Kconfig.iosched
> ===================================================================
> --- linux-2.6.12-rc3.orig/drivers/block/Kconfig.iosched
> +++ linux-2.6.12-rc3/drivers/block/Kconfig.iosched
> @@ -38,4 +38,13 @@ config IOSCHED_CFQ
>         among all processes in the system. It should provide a fair
>         working environment, suitable for desktop systems.
>  
> +config IOSCHED_PS
> +     tristate "Proportional share I/O scheduler"
> +     default y
> +     ---help---
> +       The PS I/O scheduler apportions disk I/O bandwidth amongst classes
> +       defined through CKRM (Class-based Kernel Resource Management). It
> +       is based on CFQ but differs in the interface used (CKRM) and 
> +       implementation of differentiated service. 
> +
>  endmenu
> Index: linux-2.6.12-rc3/drivers/block/Makefile
> ===================================================================
> --- linux-2.6.12-rc3.orig/drivers/block/Makefile
> +++ linux-2.6.12-rc3/drivers/block/Makefile
> @@ -19,6 +19,7 @@ obj-$(CONFIG_IOSCHED_NOOP)  += noop-iosch
>  obj-$(CONFIG_IOSCHED_AS)     += as-iosched.o
>  obj-$(CONFIG_IOSCHED_DEADLINE)       += deadline-iosched.o
>  obj-$(CONFIG_IOSCHED_CFQ)    += cfq-iosched.o
> +obj-$(CONFIG_IOSCHED_PS)     += ps-iosched.o
>  obj-$(CONFIG_MAC_FLOPPY)     += swim3.o
>  obj-$(CONFIG_BLK_DEV_FD)     += floppy.o
>  obj-$(CONFIG_BLK_DEV_FD98)   += floppy98.o
> Index: linux-2.6.12-rc3/drivers/block/ps-iosched.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6.12-rc3/drivers/block/ps-iosched.c
> @@ -0,0 +1,1852 @@
> +/*
> + *  linux/drivers/block/ps-iosched.c
> + *
> + *  PS, or proportional share disk scheduler for use with 
> + *  Class-based Kernel Resource Management (CKRM).
> + *
> + *  Very similar to Completely Fair Queueing (CFQ) disk scheduler
> + *  written by Jens Axboe.

I see this comment through the patch a few times but I don't see a lot
of code sharing going on.  I'll try to point out cases where it seems
like we might want to share more code before pushing towards mainline.

> + *
> + *  Copyright (C) 2005 Shailabh Nagar <[EMAIL PROTECTED]>
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/blkdev.h>
> +#include <linux/elevator.h>
> +#include <linux/bio.h>
> +#include <linux/config.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/init.h>
> +#include <linux/compiler.h>
> +#include <linux/hash.h>
> +#include <linux/rbtree.h>
> +#include <linux/mempool.h>
> +
> +static unsigned long max_elapsed_prq;
> +static unsigned long max_elapsed_dispatch;
> +
> +/*
> + * tunables
> + */
> +static int ps_quantum = 4;           /* max queue in one round of service */
> +static int ps_queued = 8;            /* minimum rq allocate limit per-queue*/
> +static int ps_service = HZ;          /* period over which service is avg */
> +static int ps_fifo_expire_r = HZ / 2;        /* fifo timeout for sync 
> requests */
> +static int ps_fifo_expire_w = 5 * HZ;        /* fifo timeout for async 
> requests */
> +static int ps_fifo_rate = HZ / 8;    /* fifo expiry rate */
> +static int ps_back_max = 16 * 1024;  /* maximum backwards seek, in KiB */
> +static int ps_back_penalty = 2;      /* penalty of a backwards seek */
> +
> +/*
> + * for the hash of psq inside the psd
> + */
> +#define PS_QHASH_SHIFT               6
> +#define PS_QHASH_ENTRIES     (1 << PS_QHASH_SHIFT)
> +#define list_entry_qhash(entry)      hlist_entry((entry), struct ps_queue, 
> ps_hash)
> +
> +/*
> + * for the hash of prq inside the psq
> + */
> +#define PS_MHASH_SHIFT               6
> +#define PS_MHASH_BLOCK(sec)  ((sec) >> 3)
> +#define PS_MHASH_ENTRIES     (1 << PS_MHASH_SHIFT)
> +#define PS_MHASH_FN(sec)     hash_long(PS_MHASH_BLOCK(sec), PS_MHASH_SHIFT)
> +#define rq_hash_key(rq)              ((rq)->sector + (rq)->nr_sectors)
> +#define list_entry_hash(ptr) hlist_entry((ptr), struct ps_rq, hash)
> +
> +#define list_entry_psq(ptr)  list_entry((ptr), struct ps_queue, ps_list)
> +
> +#define RQ_DATA(rq)          (rq)->elevator_private
> +
> +/*
> + * rb-tree defines
> + */
> +#define RB_NONE                      (2)
> +#define RB_EMPTY(node)               ((node)->rb_node == NULL)
> +#define RB_CLEAR_COLOR(node) (node)->rb_color = RB_NONE
> +#define RB_CLEAR(node)               do {    \
> +     (node)->rb_parent = NULL;       \
> +     RB_CLEAR_COLOR((node));         \
> +     (node)->rb_right = NULL;        \
> +     (node)->rb_left = NULL;         \
> +} while (0)
> +#define RB_CLEAR_ROOT(root)  ((root)->rb_node = NULL)
> +#define ON_RB(node)          ((node)->rb_color != RB_NONE)
> +#define rb_entry_prq(node)   rb_entry((node), struct ps_rq, rb_node)
> +#define rq_rb_key(rq)                (rq)->sector
> +
> +/*
> + * threshold for switching off non-tag accounting
> + */
> +#define PS_MAX_TAG           (4)
> +
> +/*
> + * sort key types and names
> + */
> +enum {
> +     PS_KEY_PGID,
> +     PS_KEY_TGID,
> +     PS_KEY_UID,
> +     PS_KEY_GID,
> +     PS_KEY_LAST,
> +};
> +
> +static char *ps_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
> +
> +static kmem_cache_t *prq_pool;
> +static kmem_cache_t *ps_pool;
> +static kmem_cache_t *ps_ioc_pool;
> +
> +struct ps_data {

This sounds a lot like the ps(1) command's data.  An unfortunate
choice of names, but I don't see any conflicts.

> +
> +struct ps_queue {
> +     /* reference count */
> +     atomic_t ref;
> +     /* parent ps_data */
> +     struct ps_data *psd;
> +     /* hash of mergeable requests */
> +     struct hlist_node ps_hash;
> +     /* hash key */
> +     unsigned long key;
> +     /* whether queue is on rr (or empty) list */
> +     int on_rr;
> +     /* on either rr or empty list of psd */
> +     struct list_head ps_list;
> +     /* sorted list of pending requests */
> +     struct rb_root sort_list;
> +     /* if fifo isn't expired, next request to serve */
> +     struct ps_rq *next_prq;
> +     /* requests queued in sort_list */
> +     int queued[2];
> +     /* currently allocated requests */
> +     int allocated[2];
> +     /* fifo list of requests in sort_list */
> +     struct list_head fifo[2];
> +     /* last time fifo expired */
> +     unsigned long last_fifo_expire;

Can these comments be moved to the right of the element that they
go with?  It looks like the comments are first and this is very hard
to read.

> +
> +     int key_type;
> +
> +     unsigned long service_start;
> +     unsigned long service_used;
> +
> +     unsigned int max_rate;
> +
> +     /* number of requests that have been handed to the driver */
> +     int in_flight;
> +     /* number of currently allocated requests */
> +     int alloc_limit[2];

Same here, and brief comments added for the rest of the elements
would be nice.

> +};
> +
> +struct ps_rq {
> +     struct rb_node rb_node;
> +     sector_t rb_key;
> +     struct request *request;
> +     struct hlist_node hash;
> +
> +     struct ps_queue *ps_queue;
> +     struct ps_io_context *io_context;
> +
> +     unsigned long service_start;
> +     unsigned long queue_start;
> +
> +     unsigned int in_flight : 1;
> +     unsigned int accounted : 1;
> +     unsigned int is_sync   : 1;
> +     unsigned int is_write  : 1;

I thought bit fields were verboten?  I don't know the state of the
various GCC compilers but this has often been a problem area.  If
this is also done in the CFQ scheduler, I'm less worried, but if
not, we should switch these to enums or bitmasks.

> +};
> +
> +static struct ps_queue *ps_find_ps_hash(struct ps_data *, unsigned long);
> +static void ps_dispatch_sort(request_queue_t *, struct ps_rq *);
> +static void ps_update_next_prq(struct ps_rq *);
> +static void ps_put_psd(struct ps_data *psd);
> +
> +/*
> + * what the fairness is based on (ie how processes are grouped and
> + * differentiated)
> + */

This comment could be a bit more descriptive.  I think from the code,
this might be something like:  What is the grouping key for sorting
and differentiating IO requests, e.g. Process Group, Thread Group (?),
UID, GID, etc.

> +static inline unsigned long
> +ps_hash_key(struct ps_data *psd, struct task_struct *tsk)
> +{
> +     /*
> +      * optimize this so that ->key_type is the offset into the struct
> +      */

On the comment - is that what the code does, or is that a directive
for future improvement?

> +     switch (psd->key_type) {
> +             case PS_KEY_PGID:
> +                     return process_group(tsk);
> +             default:
> +             case PS_KEY_TGID:
> +                     return tsk->tgid;
> +             case PS_KEY_UID:
> +                     return tsk->uid;
> +             case PS_KEY_GID:
> +                     return tsk->gid;
> +     }
> +}
> +
> +/*
> + * lots of deadline iosched dupes, can be abstracted later...
> + */

Now is later.  Time to abstract this sort of thing and create
commonality which will hopefully shorten the number of lines of
code added overall.

> +static inline void ps_del_prq_hash(struct ps_rq *prq)
> +{
> +     hlist_del_init(&prq->hash);
> +}
> +
> +static void ps_remove_merge_hints(request_queue_t *q, struct ps_rq *prq)
> +{
> +     ps_del_prq_hash(prq);
> +
> +     if (q->last_merge == prq->request)
> +             q->last_merge = NULL;
> +
> +     ps_update_next_prq(prq);
> +}
> +
> +static inline void ps_add_prq_hash(struct ps_data *psd, struct ps_rq *prq)
> +{
> +     const int hash_idx = PS_MHASH_FN(rq_hash_key(prq->request));
> +
> +     BUG_ON(!hlist_unhashed(&prq->hash));

Do we really want to panic here?  Why would an entry not be hashed?
Why isn't there a more graceful recovery if it *could* happen?  How
could we prevent this from happening in the first place?

> +
> +     hlist_add_head(&prq->hash, &psd->prq_hash[hash_idx]);
> +}
> +
> +static struct request *ps_find_rq_hash(struct ps_data *psd, sector_t offset)
> +{
> +     struct hlist_head *hash_list = &psd->prq_hash[PS_MHASH_FN(offset)];
> +     struct hlist_node *entry, *next;
> +
> +     hlist_for_each_safe(entry, next, hash_list) {
> +             struct ps_rq *prq = list_entry_hash(entry);
> +             struct request *__rq = prq->request;
> +
> +             BUG_ON(hlist_unhashed(&prq->hash));

Ditto on the previous "do we really want to panic here" comment.

> +
> +             if (!rq_mergeable(__rq)) {
> +                     ps_del_prq_hash(prq);
> +                     continue;
> +             }
> +
> +             if (rq_hash_key(__rq) == offset)
> +                     return __rq;
> +     }
> +
> +     return NULL;
> +}
> +
> +/*
> + * Lifted from AS - choose which of prq1 and prq2 that is best served now.
> + * We choose the request that is closest to the head right now. Distance
> + * behind the head are penalized and only allowed to a certain extent.
> + */

Again, extract common code into functions, minimize the copying and
cloning.  Otherwise bug fixes will make it into one routine and not
the other.

> +static struct ps_rq *
> +ps_choose_req(struct ps_data *psd, struct ps_rq *prq1, struct ps_rq *prq2)
> +{
> +     sector_t last, s1, s2, d1 = 0, d2 = 0;
> +     int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
> +     unsigned long back_max;
> +
> +     if (prq1 == NULL || prq1 == prq2)
> +             return prq2;
> +     if (prq2 == NULL)
> +             return prq1;
> +
> +     s1 = prq1->request->sector;
> +     s2 = prq2->request->sector;
> +
> +     last = psd->last_sector;
> +
> +#if 0
> +     if (!list_empty(&psd->queue->queue_head)) {
> +             struct list_head *entry = &psd->queue->queue_head;
> +             unsigned long distance = ~0UL;
> +             struct request *rq;
> +
> +             while ((entry = entry->prev) != &psd->queue->queue_head) {
> +                     rq = list_entry_rq(entry);
> +
> +                     if (blk_barrier_rq(rq))
> +                             break;
> +
> +                     if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
> +                             distance = abs(s1 - rq->sector +rq->nr_sectors);
> +                             last = rq->sector + rq->nr_sectors;
> +                     }
> +                     if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
> +                             distance = abs(s2 - rq->sector +rq->nr_sectors);
> +                             last = rq->sector + rq->nr_sectors;
> +                     }
> +             }
> +     }
> +#endif

If 0 should be removed.  If you want something like this for debugging,
create a patch that is added on after this one with the debugging code.

> +/*
> + * queue lock held here
> + */
> +static void ps_put_request(request_queue_t *q, struct request *rq)
> +{
> +     struct ps_data *psd = q->elevator->elevator_data;
> +     struct ps_rq *prq = RQ_DATA(rq);
> +
> +     if (prq) {
> +             struct ps_queue *psq = prq->ps_queue;
> +
> +             BUG_ON(q->last_merge == rq);
> +             BUG_ON(!hlist_unhashed(&prq->hash));

Lots of BUG_ONs - why?  Do we really want to panic while running
IO?  Again, what can we do to avoid this?

> +
> +             if (prq->io_context)
> +                     put_io_context(prq->io_context->ioc);
> +
> +             BUG_ON(!psq->allocated[prq->is_write]);

And another...

> +             psq->allocated[prq->is_write]--;
> +
> +             mempool_free(prq, psd->prq_pool);
> +             rq->elevator_private = NULL;
> +
> +             smp_mb();
> +             ps_check_waiters(q, psq);
> +             ps_put_queue(psq);
> +     }
> +}
> +
> +/*
> + * Allocate ps data structures associated with this request. A queue and
> + */
> +static int ps_set_request(request_queue_t *q, struct request *rq, int 
> gfp_mask)
> +{
> +     struct ps_data *psd = q->elevator->elevator_data;
> +     struct ps_io_context *pic;
> +     const int rw = rq_data_dir(rq);
> +     struct ps_queue *psq, *saved_psq;
> +     struct ps_rq *prq;
> +     unsigned long flags;
> +
> +     might_sleep_if(gfp_mask & __GFP_WAIT);
> +
> +     spin_lock_irqsave(q->queue_lock, flags);
> +
> +     psq = __ps_get_queue(psd, ps_hash_key(psd, current), gfp_mask);
> +     if (!psq)
> +             goto out_lock;
> +
> +repeat:
> +     if (psq->allocated[rw] >= psd->max_queued)
> +             goto out_lock;
> +
> +     psq->allocated[rw]++;
> +     spin_unlock_irqrestore(q->queue_lock, flags);
> +
> +     /*
> +      * if hashing type has changed, the ps_queue might change here.
> +      */
> +     saved_psq = psq;
> +     pic = ps_get_io_context(&psq, gfp_mask);
> +     if (!pic)
> +             goto err;
> +
> +     /*
> +      * repeat allocation checks on queue change
> +      */
> +     if (unlikely(saved_psq != psq)) {
> +             spin_lock_irqsave(q->queue_lock, flags);
> +             saved_psq->allocated[rw]--;
> +             goto repeat;
> +     }
> +
> +     prq = mempool_alloc(psd->prq_pool, gfp_mask);
> +     if (prq) {
> +             RB_CLEAR(&prq->rb_node);
> +             prq->rb_key = 0;
> +             prq->request = rq;
> +             INIT_HLIST_NODE(&prq->hash);
> +             prq->ps_queue = psq;
> +             prq->io_context = pic;
> +             prq->service_start = prq->queue_start = 0;
> +             prq->in_flight = prq->accounted = prq->is_sync = 0;
> +             prq->is_write = rw;
> +             rq->elevator_private = prq;
> +             psq->alloc_limit[rw] = 0;
> +             return 0;
> +     }
> +
> +     put_io_context(pic->ioc);
> +err:
> +     spin_lock_irqsave(q->queue_lock, flags);
> +     psq->allocated[rw]--;
> +     ps_put_queue(psq);
> +out_lock:
> +     spin_unlock_irqrestore(q->queue_lock, flags);
> +     return 1;
> +}
> +
> +static void ps_put_psd(struct ps_data *psd)
> +{
> +     request_queue_t *q = psd->queue;
> +
> +     if (!atomic_dec_and_test(&psd->ref))
> +             return;
> +
> +     blk_put_queue(q);
> +
> +     mempool_destroy(psd->prq_pool);
> +     kfree(psd->prq_hash);
> +     kfree(psd->ps_hash);
> +     kfree(psd);
> +}
> +
> +static void ps_exit_queue(elevator_t *e)
> +{
> +     ps_put_psd(e->elevator_data);
> +}
> +
> +static int ps_init_queue(request_queue_t *q, elevator_t *e)
> +{
> +     struct ps_data *psd;
> +     int i;
> +
> +     psd = kmalloc(sizeof(*psd), GFP_KERNEL);
> +     if (!psd)
> +             return -ENOMEM;
> +
> +     memset(psd, 0, sizeof(*psd));
> +     INIT_LIST_HEAD(&psd->rr_list);
> +     INIT_LIST_HEAD(&psd->empty_list);
> +
> +     psd->prq_hash = kmalloc(sizeof(struct hlist_head) * PS_MHASH_ENTRIES, 
> GFP_KERNEL);
> +     if (!psd->prq_hash)
> +             goto out_prqhash;
> +
> +     psd->ps_hash = kmalloc(sizeof(struct hlist_head) * PS_QHASH_ENTRIES, 
> GFP_KERNEL);
> +     if (!psd->ps_hash)
> +             goto out_pshash;
> +
> +     psd->prq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, 
> mempool_free_slab, prq_pool);
> +     if (!psd->prq_pool)
> +             goto out_prqpool;
> +
> +     for (i = 0; i < PS_MHASH_ENTRIES; i++)
> +             INIT_HLIST_HEAD(&psd->prq_hash[i]);
> +     for (i = 0; i < PS_QHASH_ENTRIES; i++)
> +             INIT_HLIST_HEAD(&psd->ps_hash[i]);
> +
> +     e->elevator_data = psd;
> +
> +     psd->queue = q;
> +     atomic_inc(&q->refcnt);
> +
> +     /*
> +      * just set it to some high value, we want anyone to be able to queue
> +      * some requests. fairness is handled differently
> +      */
> +     q->nr_requests = 1024;
> +     psd->max_queued = q->nr_requests / 16;
> +     q->nr_batching = ps_queued;
> +     psd->key_type = PS_KEY_TGID;
> +     psd->find_best_prq = 1;
> +     atomic_set(&psd->ref, 1);
> +
> +     psd->ps_queued = ps_queued;
> +     psd->ps_quantum = ps_quantum;
> +     psd->ps_fifo_expire_r = ps_fifo_expire_r;
> +     psd->ps_fifo_expire_w = ps_fifo_expire_w;
> +     psd->ps_fifo_batch_expire = ps_fifo_rate;
> +     psd->ps_back_max = ps_back_max;
> +     psd->ps_back_penalty = ps_back_penalty;
> +
> +     return 0;
> +out_prqpool:
> +     kfree(psd->ps_hash);
> +out_pshash:
> +     kfree(psd->prq_hash);
> +out_prqhash:
> +     kfree(psd);
> +     return -ENOMEM;
> +}
> +
> +static void ps_slab_kill(void)
> +{
> +     if (prq_pool)
> +             kmem_cache_destroy(prq_pool);
> +     if (ps_pool)
> +             kmem_cache_destroy(ps_pool);
> +     if (ps_ioc_pool)
> +             kmem_cache_destroy(ps_ioc_pool);
> +}
> +
> +static int __init ps_slab_setup(void)
> +{
> +     prq_pool = kmem_cache_create("prq_pool", sizeof(struct ps_rq), 0, 0,
> +                                     NULL, NULL);
> +     if (!prq_pool)
> +             goto fail;
> +
> +     ps_pool = kmem_cache_create("ps_pool", sizeof(struct ps_queue), 0, 0,
> +                                     NULL, NULL);
> +     if (!ps_pool)
> +             goto fail;
> +
> +     ps_ioc_pool = kmem_cache_create("ps_ioc_pool",
> +                     sizeof(struct ps_io_context), 0, 0, NULL, NULL);
> +     if (!ps_ioc_pool)
> +             goto fail;
> +
> +     return 0;
> +fail:
> +     ps_slab_kill();
> +     return -ENOMEM;
> +}
> +

Seems like this might be a good point to split out this patch.

There is a LOT of code above this point.  Makes it very hard to do
any kind of a thorough review.  :(

> +
> +/*
> + * sysfs parts below -->
> + */
> +struct ps_fs_entry {
> +     struct attribute attr;
> +     ssize_t (*show)(struct ps_data *, char *);
> +     ssize_t (*store)(struct ps_data *, const char *, size_t);
> +};
> +
> +static ssize_t
> +ps_var_show(unsigned int var, char *page)
> +{
> +     return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +ps_var_store(unsigned int *var, const char *page, size_t count)
> +{
> +     char *p = (char *) page;
> +
> +     *var = simple_strtoul(p, &p, 10);
> +     return count;
> +}
> +
> +static ssize_t
> +ps_clear_elapsed(struct ps_data *psd, const char *page, size_t count)
> +{
> +     max_elapsed_dispatch = max_elapsed_prq = 0;
> +     return count;
> +}
> +
> +static ssize_t
> +ps_set_key_type(struct ps_data *psd, const char *page, size_t count)
> +{
> +     spin_lock_irq(psd->queue->queue_lock);
> +     if (!strncmp(page, "pgid", 4))
> +             psd->key_type = PS_KEY_PGID;
> +     else if (!strncmp(page, "tgid", 4))
> +             psd->key_type = PS_KEY_TGID;
> +     else if (!strncmp(page, "uid", 3))
> +             psd->key_type = PS_KEY_UID;
> +     else if (!strncmp(page, "gid", 3))
> +             psd->key_type = PS_KEY_GID;
> +     spin_unlock_irq(psd->queue->queue_lock);
> +     return count;
> +}
> +
> +static ssize_t
> +ps_read_key_type(struct ps_data *psd, char *page)
> +{
> +     ssize_t len = 0;
> +     int i;
> +
> +     for (i = PS_KEY_PGID; i < PS_KEY_LAST; i++) {
> +             if (psd->key_type == i)
> +                     len += sprintf(page+len, "[%s] ", ps_key_types[i]);
> +             else
> +                     len += sprintf(page+len, "%s ", ps_key_types[i]);
> +     }
> +     len += sprintf(page+len, "\n");
> +     return len;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                         \
> +static ssize_t __FUNC(struct ps_data *psd, char *page)               \
> +{                                                                    \
> +     unsigned int __data = __VAR;                                    \
> +     if (__CONV)                                                     \
> +             __data = jiffies_to_msecs(__data);                      \
> +     return ps_var_show(__data, (page));                             \
> +}
> +SHOW_FUNCTION(ps_quantum_show, psd->ps_quantum, 0);
> +SHOW_FUNCTION(ps_queued_show, psd->ps_queued, 0);
> +SHOW_FUNCTION(ps_fifo_expire_r_show, psd->ps_fifo_expire_r, 1);
> +SHOW_FUNCTION(ps_fifo_expire_w_show, psd->ps_fifo_expire_w, 1);
> +SHOW_FUNCTION(ps_fifo_batch_expire_show, psd->ps_fifo_batch_expire, 1);
> +SHOW_FUNCTION(ps_find_best_show, psd->find_best_prq, 0);
> +SHOW_FUNCTION(ps_back_max_show, psd->ps_back_max, 0);
> +SHOW_FUNCTION(ps_back_penalty_show, psd->ps_back_penalty, 0);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                      
> \
> +static ssize_t __FUNC(struct ps_data *psd, const char *page, size_t count)   
> \
> +{                                                                    \
> +     unsigned int __data;                                            \
> +     int ret = ps_var_store(&__data, (page), count);         \
> +     if (__data < (MIN))                                             \
> +             __data = (MIN);                                         \
> +     else if (__data > (MAX))                                        \
> +             __data = (MAX);                                         \
> +     if (__CONV)                                                     \
> +             *(__PTR) = msecs_to_jiffies(__data);                    \
> +     else                                                            \
> +             *(__PTR) = __data;                                      \
> +     return ret;                                                     \
> +}
> +STORE_FUNCTION(ps_quantum_store, &psd->ps_quantum, 1, UINT_MAX, 0);
> +STORE_FUNCTION(ps_queued_store, &psd->ps_queued, 1, UINT_MAX, 0);
> +STORE_FUNCTION(ps_fifo_expire_r_store, &psd->ps_fifo_expire_r, 1, UINT_MAX, 
> 1);
> +STORE_FUNCTION(ps_fifo_expire_w_store, &psd->ps_fifo_expire_w, 1, UINT_MAX, 
> 1);
> +STORE_FUNCTION(ps_fifo_batch_expire_store, &psd->ps_fifo_batch_expire, 0, 
> UINT_MAX, 1);
> +STORE_FUNCTION(ps_find_best_store, &psd->find_best_prq, 0, 1, 0);
> +STORE_FUNCTION(ps_back_max_store, &psd->ps_back_max, 0, UINT_MAX, 0);
> +STORE_FUNCTION(ps_back_penalty_store, &psd->ps_back_penalty, 1, UINT_MAX, 0);
> +#undef STORE_FUNCTION
> +
> +static struct ps_fs_entry ps_quantum_entry = {
> +     .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_quantum_show,
> +     .store = ps_quantum_store,
> +};
> +static struct ps_fs_entry ps_queued_entry = {
> +     .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_queued_show,
> +     .store = ps_queued_store,
> +};
> +static struct ps_fs_entry ps_fifo_expire_r_entry = {
> +     .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_fifo_expire_r_show,
> +     .store = ps_fifo_expire_r_store,
> +};
> +static struct ps_fs_entry ps_fifo_expire_w_entry = {
> +     .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_fifo_expire_w_show,
> +     .store = ps_fifo_expire_w_store,
> +};
> +static struct ps_fs_entry ps_fifo_batch_expire_entry = {
> +     .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_fifo_batch_expire_show,
> +     .store = ps_fifo_batch_expire_store,
> +};
> +static struct ps_fs_entry ps_find_best_entry = {
> +     .attr = {.name = "find_best_prq", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_find_best_show,
> +     .store = ps_find_best_store,
> +};
> +static struct ps_fs_entry ps_back_max_entry = {
> +     .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_back_max_show,
> +     .store = ps_back_max_store,
> +};
> +static struct ps_fs_entry ps_back_penalty_entry = {
> +     .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_back_penalty_show,
> +     .store = ps_back_penalty_store,
> +};
> +static struct ps_fs_entry ps_clear_elapsed_entry = {
> +     .attr = {.name = "clear_elapsed", .mode = S_IWUSR },
> +     .store = ps_clear_elapsed,
> +};
> +static struct ps_fs_entry ps_key_type_entry = {
> +     .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR },
> +     .show = ps_read_key_type,
> +     .store = ps_set_key_type,
> +};
> +
> +static struct attribute *default_attrs[] = {
> +     &ps_quantum_entry.attr,
> +     &ps_queued_entry.attr,
> +     &ps_fifo_expire_r_entry.attr,
> +     &ps_fifo_expire_w_entry.attr,
> +     &ps_fifo_batch_expire_entry.attr,
> +     &ps_key_type_entry.attr,
> +     &ps_find_best_entry.attr,
> +     &ps_back_max_entry.attr,
> +     &ps_back_penalty_entry.attr,
> +     &ps_clear_elapsed_entry.attr,
> +     NULL,
> +};
> +
> +#define to_ps(atr) container_of((atr), struct ps_fs_entry, attr)
> +
> +static ssize_t
> +ps_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
> +{
> +     elevator_t *e = container_of(kobj, elevator_t, kobj);
> +     struct ps_fs_entry *entry = to_ps(attr);
> +
> +     if (!entry->show)
> +             return 0;
> +
> +     return entry->show(e->elevator_data, page);
> +}
> +
> +static ssize_t
> +ps_attr_store(struct kobject *kobj, struct attribute *attr,
> +            const char *page, size_t length)
> +{
> +     elevator_t *e = container_of(kobj, elevator_t, kobj);
> +     struct ps_fs_entry *entry = to_ps(attr);
> +
> +     if (!entry->store)
> +             return -EINVAL;
> +
> +     return entry->store(e->elevator_data, page, length);
> +}
> +
> +static struct sysfs_ops ps_sysfs_ops = {
> +     .show   = ps_attr_show,
> +     .store  = ps_attr_store,
> +};
> +
> +struct kobj_type ps_ktype = {
> +     .sysfs_ops      = &ps_sysfs_ops,
> +     .default_attrs  = default_attrs,
> +};
> +
> +static struct elevator_type iosched_ps = {
> +     .ops = {
> +             .elevator_merge_fn =            ps_merge,
> +             .elevator_merged_fn =           ps_merged_request,
> +             .elevator_merge_req_fn =        ps_merged_requests,
> +             .elevator_next_req_fn =         ps_next_request,
> +             .elevator_add_req_fn =          ps_insert_request,
> +             .elevator_remove_req_fn =       ps_remove_request,
> +             .elevator_requeue_req_fn =      ps_requeue_request,
> +             .elevator_queue_empty_fn =      ps_queue_empty,
> +             .elevator_completed_req_fn =    ps_completed_request,
> +             .elevator_former_req_fn =       ps_former_request,
> +             .elevator_latter_req_fn =       ps_latter_request,
> +             .elevator_set_req_fn =          ps_set_request,
> +             .elevator_put_req_fn =          ps_put_request,
> +             .elevator_may_queue_fn =        ps_may_queue,
> +             .elevator_init_fn =             ps_init_queue,
> +             .elevator_exit_fn =             ps_exit_queue,
> +     },
> +     .elevator_ktype =       &ps_ktype,
> +     .elevator_name =        "ps",
> +     .elevator_owner =       THIS_MODULE,
> +};
> +
> +int ps_init(void)
> +{
> +     int ret;
> +
> +     if (ps_slab_setup())
> +             return -ENOMEM;
> +
> +     ret = elv_register(&iosched_ps);
> +     if (!ret) {
> +             __module_get(THIS_MODULE);
> +             return 0;
> +     }
> +
> +     ps_slab_kill();
> +     return ret;
> +}
> +
> +static void __exit ps_exit(void)
> +{
> +     ps_slab_kill();
> +     elv_unregister(&iosched_ps);
> +}
> +
> +module_init(ps_init);
> +module_exit(ps_exit);
> +
> +MODULE_AUTHOR("Shailabh Nagar");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("CKRM Proportional Share IO scheduler");

This next section could also be a nice little patch of its own.
It just adds definitions, could be applied before anything else,
presumably without any risk/dependencies.

> Index: linux-2.6.12-rc3/include/linux/blkdev.h
> ===================================================================
> --- linux-2.6.12-rc3.orig/include/linux/blkdev.h
> +++ linux-2.6.12-rc3/include/linux/blkdev.h
> @@ -66,6 +66,21 @@ struct cfq_io_context {
>       struct cfq_queue *cfqq;
>  };
>  
> +struct ps_queue;
> +struct ps_io_context {
> +     void (*dtor)(struct ps_io_context *);
> +     void (*exit)(struct ps_io_context *);
> +
> +     struct io_context *ioc;
> +
> +     /*
> +      * circular list of cfq_io_contexts belonging to a process io context
> +      */
> +     struct list_head list;
> +     struct ps_queue *psq;
> +};
> +
> +
>  /*
>   * This is the per-process I/O subsystem state.  It is refcounted and
>   * kmalloc'ed. Currently all fields are modified in process io context
> @@ -85,6 +100,7 @@ struct io_context {
>  
>       struct as_io_context *aic;
>       struct cfq_io_context *cic;
> +     struct ps_io_context *pic;
>  };
>  
>  void put_io_context(struct io_context *ioc);
> 

Final comment, this is a LOT of code for one patch.  Also, I'd like
to see this Acked by Jens before pushing it further upstream.

gerrit


-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7393&alloc_id=16281&op=click
_______________________________________________
ckrm-tech mailing list
https://lists.sourceforge.net/lists/listinfo/ckrm-tech

Reply via email to