Documentation/ckrm/ckrm-io    |   98 ++++
 drivers/block/Kconfig.iosched |    9 
 drivers/block/Makefile        |    4 
 drivers/block/ckrm-io.c       |  889 ++++++++++++++++++++++++++++++++++++++++++
 drivers/block/ps-iosched.c    |  345 +++++++++++++---
 include/linux/ckrm-io.h       |  134 ++++++
 include/linux/proc_fs.h       |    1 
 init/Kconfig                  |   13 
 8 files changed, 1421 insertions(+), 72 deletions(-)

Signed-off-by:  Shailabh Nagar <[EMAIL PROTECTED]>
Signed-off-by:  Chandra Seetharaman <[EMAIL PROTECTED]> 

Index: linux-2.6.12-rc3/Documentation/ckrm/ckrm-io
===================================================================
--- /dev/null
+++ linux-2.6.12-rc3/Documentation/ckrm/ckrm-io
@@ -0,0 +1,98 @@
+CKRM I/O controller
+
+Please send feedback to [EMAIL PROTECTED]
+
+
+The I/O controller consists of 
+- a new I/O scheduler called ps-iosched which is an incremental update 
+to the cfq ioscheduler. It has enough differences with cfq to warrant a
+separate I/O scheduler. 
+- ckrm-io : the controller which interfaces ps-iosched with CKRM's core
+
+ckrm-io enforces shares at the granularity of an "epoch", currently defined as
+1 second. The relative share of each class in rcfs is translated to an absolute
+"sectorate" for each block device managed by ckrm-io. Sectorate is defined as
+average number of sectors served per epoch for a class. This value is treated
+as a hard limit - every time a class exceeds this average for *any* device, the
+class' I/O gets deferred till the average drops back below the limit.
+
+Compiling ckrm-io
+-----------------
+Currently, please compile it into the kernel using the config parameter
+
+          General Setup
+                Class-based Kernel Resource Management --->
+                       Disk I/O Resource Controller
+                  
+A later version will fix the use of sched_clock() by ps-iosched.c that is
+preventing it from being compiled as a module.
+
+
+Using ckrm-io
+-------------
+
+1. Boot into the kernel and mount rcfs
+
+# mount -t rcfs none /rcfs 
+
+2. Choose a device to bring under ckrm-io's control (it is recommended you
+choose a disk not hosting your root filesystem until the controller gets tested
+better). For device hdc, use something like
+
+# echo "ps" > /sys/block/hdc/queue/scheduler
+# cat /sys/block/hdc/queue/scheduler
+noop anticipatory deadline cfq [ps]
+
+
+3. Verify rcfs root's sectorate
+
+# cat /rcfs/taskclass/stats
+res=io, abs limit 10000
+/block/hdc/queue skip .. timdout .. avsec .. rate .. sec0 .. sec1 ..
+
+"avsec" is the average number of sectors served for the class
+"rate" is its current limit 
+The rest of the numbers are of interest in debugging only.
+
+
+4. Launch  I/O workload(s) (dd has been used so far) in a separate terminal.
+Multiple instances of 
+
+# time dd if=/dev/hdc of=/dev/null bs=4096 count=1000000 &
+
+5. Watch the "avsec" and "rate" parameters in /rcfs/taskclass (do this in a
+separate terminal)
+
+# while : ; do cat /rcfs/taskclass/stats; sleep 1; done
+
+6a. Change the absolute sectorate for the root class
+
+# echo "res=io,rootsectorate=1000" > /rcfs/taskclass/config
+# echo "1000" > /sys/block/hdc/queue/ioscheduler/max_sectorate
+
+6b. Verify that "rate" has changed to the new value in the terminal where
+/rcfs/taskclass/stats is being monitored (step 5)
+
+
+Or just run the I/O workload twice, with different values of sectorate and see
+the difference in completion times.
+
+
+
+Current bugs/limitations
+------------------------
+
+- only the root taskclass can be controlled. The shares for children created
+  under /rcfs/taskclass do not change. 
+
+- Having two parameters to modify
+  "rootsectorate", settable within /rcfs/taskclass/config  and 
+  "max_sectorate", set as /sys/block/<device>/queue/ioscheduler/max_sectorate
+
+could be reduced to one (just the latter). 
+
+
+
+
+
+
Index: linux-2.6.12-rc3/drivers/block/Kconfig.iosched
===================================================================
--- linux-2.6.12-rc3.orig/drivers/block/Kconfig.iosched
+++ linux-2.6.12-rc3/drivers/block/Kconfig.iosched
@@ -38,13 +38,4 @@ config IOSCHED_CFQ
          among all processes in the system. It should provide a fair
          working environment, suitable for desktop systems.
 
-config IOSCHED_PS
-       tristate "Proportional share I/O scheduler"
-       default y
-       ---help---
-         The PS I/O scheduler apportions disk I/O bandwidth amongst classes
-         defined through CKRM (Class-based Kernel Resource Management). It
-         is based on CFQ but differs in the interface used (CKRM) and 
-         implementation of differentiated service. 
-
 endmenu
Index: linux-2.6.12-rc3/drivers/block/Makefile
===================================================================
--- linux-2.6.12-rc3.orig/drivers/block/Makefile
+++ linux-2.6.12-rc3/drivers/block/Makefile
@@ -13,13 +13,13 @@
 # kblockd threads
 #
 
-obj-y  := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
+obj-y  := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)       += as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
-obj-$(CONFIG_IOSCHED_PS)       += ps-iosched.o
+obj-$(CONFIG_CKRM_RES_BLKIO)    += ckrm-io.o ps-iosched.o
 obj-$(CONFIG_MAC_FLOPPY)       += swim3.o
 obj-$(CONFIG_BLK_DEV_FD)       += floppy.o
 obj-$(CONFIG_BLK_DEV_FD98)     += floppy98.o
Index: linux-2.6.12-rc3/drivers/block/ckrm-io.c
===================================================================
--- /dev/null
+++ linux-2.6.12-rc3/drivers/block/ckrm-io.c
@@ -0,0 +1,889 @@
+/* linux/drivers/block/ckrm_io.c : Block I/O Resource Controller for CKRM
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2004
+ * 
+ * 
+ * Provides best-effort block I/O bandwidth control for CKRM 
+ * This file provides the CKRM API. The underlying scheduler is the
+ * ps (proportional share) ioscheduler.
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <linux/kobject.h>
+#include <asm/errno.h>
+#include <asm/div64.h>
+
+#include <linux/ckrm_tc.h>
+#include <linux/ckrm-io.h>
+
+#define CKI_UNUSED  1
+
+/* sectorate == 512 byte sectors served in PS_EPOCH ns*/
+
+#define CKI_ROOTSECTORATE_DEF  100000
+#define CKI_MINSECTORATE_DEF   100
+
+#define CKI_IOUSAGE_UNIT       512
+
+
+#if CKI_UNUSED
+typedef struct ckrm_io_stats{
+       struct timeval       epochstart ; /* all measurements relative to this 
+                                            start time */
+       unsigned long        blksz;  /* size of bandwidth unit */
+       atomic_t             blkrd;  /* read units submitted to DD */
+       atomic_t             blkwr; /* write units submitted to DD */
+
+} cki_stats_t;          /* per class I/O statistics */
+#endif
+
+typedef struct ckrm_io_class {
+
+       struct ckrm_core_class *core;
+       struct ckrm_core_class *parent;
+       
+       
+
+       struct ckrm_shares shares;
+       struct rw_semaphore  sem; /* protect rate_list and cnt_*  */
+       
+       struct list_head  rate_list;
+
+       /* Absolute shares of this class
+        * in local units. 
+        */
+       int cnt_guarantee; /* Allocation as parent */
+       int cnt_unused;    /* Allocation to default subclass */
+       int cnt_limit;
+
+#ifdef CKI_UNUSED
+       /* Statistics, for class and default subclass */
+       cki_stats_t stats; 
+       cki_stats_t mystats;
+#endif
+} cki_icls_t;
+
+/* Internal functions */
+static inline void cki_reset_stats(cki_stats_t *usg);
+static inline void init_icls_one(cki_icls_t *icls);
+static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres);
+
+/* Functions from ps_iosched */
+extern int ps_drop_psq(struct ps_data *psd, unsigned long key);
+
+
+/* CKRM Resource Controller API functions */
+static void * cki_alloc(struct ckrm_core_class *this,
+                       struct ckrm_core_class * parent);
+static void cki_free(void *res);
+static int cki_setshare(void *res, struct ckrm_shares * shares);
+static int cki_getshare(void *res, struct ckrm_shares * shares);
+static int cki_getstats(void *res, struct seq_file *);
+static int cki_resetstats(void *res);
+static int cki_showconfig(void *res, struct seq_file *sfile);
+static int cki_setconfig(void *res, const char *cfgstr);
+static void cki_chgcls(void *tsk, void *oldres, void *newres);
+
+/* Global data */
+struct ckrm_res_ctlr cki_rcbs;
+
+struct cki_data ckid;
+EXPORT_SYMBOL_GPL(ckid);
+
+struct ps_rate cki_def_psrate; 
+EXPORT_SYMBOL_GPL(cki_def_psrate);
+
+struct rw_semaphore psdlistsem;
+EXPORT_SYMBOL(psdlistsem);
+
+LIST_HEAD(ps_psdlist);
+EXPORT_SYMBOL(ps_psdlist);
+
+
+static struct psdrate *cki_find_rate(struct ckrm_io_class *icls,
+                                    struct ps_data *psd)
+{
+       struct psdrate *prate;
+       
+       down_read(&icls->sem);
+       list_for_each_entry(prate, &icls->rate_list, rate_list) {
+               if (prate->psd == psd)
+                       goto found;
+       }
+       prate = NULL;
+found:
+       up_read(&icls->sem);
+       return prate;
+}
+
+/* Exported functions */
+
+void cki_set_sectorate(cki_icls_t *icls, int sectorate)
+{
+       struct psdrate *prate;
+       u64 temp;
+       
+       down_read(&icls->sem);
+       list_for_each_entry(prate, &icls->rate_list, rate_list) {
+               temp = (u64) sectorate * prate->psd->ps_max_sectorate;
+               do_div(temp,ckid.rootsectorate);
+               atomic_set(&prate->psrate.sectorate,temp);
+       }       
+       up_read(&icls->sem);
+}
+
+/* Reset psdrate entries in icls for all current psd's 
+ * Called after a class's absolute shares change 
+ */
+void cki_reset_sectorate(cki_icls_t *icls)
+{
+       struct psdrate *prate;
+       u64 temp;
+       
+       down_read(&icls->sem);
+       list_for_each_entry(prate, &icls->rate_list, rate_list) {
+
+               if (icls->cnt_limit != CKRM_SHARE_DONTCARE) {
+                       temp = (u64) icls->cnt_limit * prate->psd->ps_max_sectorate;
+                       do_div(temp,ckid.rootsectorate);
+               } else 
+                       temp = prate->psd->ps_min_sectorate;
+               atomic_set(&prate->psrate.sectorate,temp);
+       }       
+       up_read(&icls->sem);
+
+}
+
+struct psdrate *dbprate;
+
+int cki_psdrate_init(struct ckrm_io_class *icls, struct ps_data *psd)
+{
+       struct psdrate *prate;
+       u64 temp;
+
+       prate = kmalloc(sizeof(struct psdrate),GFP_KERNEL);
+       if (!prate) 
+               return -ENOMEM;
+       
+       INIT_LIST_HEAD(&prate->rate_list);
+       prate->psd = psd;
+       memset(&prate->psrate,0,sizeof(prate->psrate));
+       
+       dbprate = prate;
+       if (icls->cnt_limit != CKRM_SHARE_DONTCARE) {
+               temp = (u64) icls->cnt_limit * psd->ps_max_sectorate;
+               do_div(temp,ckid.rootsectorate);
+       } else { 
+               temp = psd->ps_min_sectorate;
+       }
+       atomic_set(&prate->psrate.sectorate,temp);
+       
+       down_write(&icls->sem);
+       list_add(&prate->rate_list,&icls->rate_list);
+       up_write(&icls->sem);
+
+       return 0;
+}
+
+int cki_psdrate_del(struct ckrm_io_class *icls, struct ps_data *psd)
+{
+       struct psdrate *prate;
+
+       prate = cki_find_rate(icls, psd);
+       if (!prate) 
+               return 0;
+
+       down_write(&icls->sem);
+       list_del(&prate->rate_list);
+       up_write(&icls->sem);
+
+       kfree(prate);
+       return 0;
+}
+
+
+/* Create psdrate entries in icls for all current psd's */
+void cki_rates_init(cki_icls_t *icls)
+{
+       struct psd_list_entry *psdl;
+       
+       down_read(&psdlistsem);
+       list_for_each_entry(psdl,&ps_psdlist,psd_list) { 
+               if (cki_psdrate_init(icls, psdl->psd)) {
+                       printk(KERN_WARNING "%s: psdrate addition failed\n",
+                              __FUNCTION__);
+                       continue;
+               }
+       }
+       up_read(&psdlistsem);
+}
+
+/* Free all psdrate entries in icls */
+void cki_rates_del(cki_icls_t *icls)
+{
+       struct psdrate *prate, *tmp;
+       
+       down_write(&icls->sem);
+       list_for_each_entry_safe(prate, tmp, &icls->rate_list, rate_list) {
+           list_del(&prate->rate_list);
+           kfree(prate);
+       }
+       up_write(&icls->sem);
+/*     
+       down_read(&psdlistsem);
+       list_for_each_entry(psdl,&ps_psdlist,psd_list) { 
+               cki_psdrate_del(icls,psdl->psd);
+       }
+       up_read(&psdlistsem);
+*/
+}
+
+/* Called from ps-iosched.c when it initializes a new ps_data
+ *  as part of starting to manage a new device request queue 
+ */
+
+int cki_psd_init(struct ps_data *psd)
+{
+       struct ckrm_classtype *ctype = ckrm_classtypes[CKRM_CLASSTYPE_TASK_CLASS];
+       struct ckrm_core_class *core;
+       struct ckrm_io_class *icls;
+       struct psdrate *prate;
+       int ret=-ENOMEM;
+
+       /* Set psd's min and max sectorate from default values */
+       psd->ps_max_sectorate = ckid.rootsectorate;
+       psd->ps_min_sectorate = ckid.minsectorate;
+
+       down_read(&ckrm_class_sem);
+       list_for_each_entry(core, &ctype->classes, clslist) {
+               icls = ckrm_get_res_class(core, cki_rcbs.resid, cki_icls_t);
+               if (!icls)
+                       continue;
+
+               prate = cki_find_rate(icls, psd);
+               if (prate) 
+                       continue;
+
+               if (cki_psdrate_init(icls, psd)) {
+                       printk(KERN_WARNING "%s: psdrate addition failed\n",
+                              __FUNCTION__);
+                       continue;
+               }
+       }
+       ret = 0;
+
+       up_read(&ckrm_class_sem);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(cki_psd_init);
+
+/* Called whenever ps-iosched frees a ps_data 
+ *  as part of ending management of a device request queue 
+ */
+
+int cki_psd_del(struct ps_data *psd)
+{
+       struct ckrm_classtype *ctype = ckrm_classtypes[CKRM_CLASSTYPE_TASK_CLASS];
+       struct ckrm_core_class *core;
+       struct ckrm_io_class *icls;
+       int ret = 0;
+
+       down_read(&ckrm_class_sem);
+       list_for_each_entry(core, &ctype->classes, clslist) {
+               icls = ckrm_get_res_class(core, cki_rcbs.resid, cki_icls_t);
+               if (!icls)
+                       continue;
+
+               if (cki_psdrate_del(icls,psd)) {
+                       printk(KERN_WARNING "%s: psdrate deletion failed\n",
+                              __FUNCTION__);
+                       continue;
+               }
+       }
+       up_read(&ckrm_class_sem);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(cki_psd_del);
+
+struct ps_rate *cki_tsk_psrate(struct ps_data *psd, struct task_struct *tsk)
+{
+       cki_icls_t *icls;
+       struct psdrate *prate;
+
+       icls = ckrm_get_res_class(class_core(tsk->taskclass),
+                                 cki_rcbs.resid, cki_icls_t);
+       if (!icls)
+               return NULL;
+       
+       
+       prate = cki_find_rate(icls,psd);
+       if (prate)
+           return &(prate->psrate);
+       else
+           return NULL;
+}
+EXPORT_SYMBOL_GPL(cki_tsk_psrate);                     
+
+/* Exported functions end */
+
+
+#ifdef CKI_UNUSED
+static inline void cki_reset_stats(cki_stats_t *stats)
+{
+       if (stats) {
+               atomic_set(&stats->blkrd,0);
+               atomic_set(&stats->blkwr,0);
+       }
+}
+
+static inline void init_icls_stats(cki_icls_t *icls)
+{
+       struct timeval tv;
+
+       do_gettimeofday(&tv);
+       icls->stats.epochstart = icls->mystats.epochstart = tv;
+       icls->stats.blksz = icls->mystats.blksz = CKI_IOUSAGE_UNIT;
+       cki_reset_stats(&icls->stats);
+       cki_reset_stats(&icls->mystats);
+}      
+#endif
+
+/* Initialize icls to default values 
+ * No other classes touched, locks not reinitialized.
+ */
+
+static inline void init_icls_one(cki_icls_t *icls)
+{
+       /* Zero initial guarantee for scalable creation of
+          multiple classes */
+
+       /* Try out a new set */
+       
+       icls->shares.my_guarantee = CKRM_SHARE_DONTCARE;
+       icls->shares.my_limit = CKRM_SHARE_DONTCARE;
+       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+       icls->shares.unused_guarantee = icls->shares.total_guarantee;
+       icls->shares.cur_max_limit = 0;
+
+       icls->cnt_guarantee = CKRM_SHARE_DONTCARE;
+       icls->cnt_unused = CKRM_SHARE_DONTCARE;
+       icls->cnt_limit = CKRM_SHARE_DONTCARE;
+
+       INIT_LIST_HEAD(&icls->rate_list);
+#ifdef CKI_UNUSED      
+       init_icls_stats(icls);
+#endif
+}
+
+/* Initialize root's psd entries */
+static void cki_createrootrate(cki_icls_t *root, int sectorate)
+{
+       down_write(&root->sem);
+       root->cnt_guarantee = sectorate;
+       root->cnt_unused = sectorate;
+       root->cnt_limit = sectorate;
+       up_write(&root->sem);
+
+       cki_rates_init(root);
+}
+
+/* Called with root->share_lock held  */
+static void cki_setrootrate(cki_icls_t *root, int sectorate)
+{
+       down_write(&root->sem);
+       root->cnt_guarantee = sectorate;
+       root->cnt_unused = sectorate;
+       root->cnt_limit = sectorate;
+       up_write(&root->sem);
+
+       cki_reset_sectorate(root);
+}
+
+static void cki_put_psq(cki_icls_t *icls)
+{
+       struct psdrate *prate;
+       struct ckrm_task_class *tskcls;
+       
+       down_read(&icls->sem);
+       list_for_each_entry(prate, &icls->rate_list, rate_list) {
+               tskcls = container_of(icls->core,struct ckrm_task_class, core);
+               if (ps_drop_psq(prate->psd,(unsigned long)tskcls)) {
+                       printk(KERN_WARNING "%s: ps_icls_free failed\n",
+                              __FUNCTION__);
+                       continue;
+               }
+       }
+       up_read(&icls->sem);
+}
+
+static void *cki_alloc(struct ckrm_core_class *core,
+                        struct ckrm_core_class *parent)
+{
+       cki_icls_t *icls;
+       
+       icls = kmalloc(sizeof(cki_icls_t), GFP_ATOMIC);
+       if (!icls) {
+               printk(KERN_ERR "cki_res_alloc failed GFP_ATOMIC\n");
+               return NULL;
+       }
+
+       memset(icls, 0, sizeof(cki_icls_t));
+       icls->core = core;
+       icls->parent = parent;
+       init_rwsem(&icls->sem);
+
+       init_icls_one(icls);
+
+       if (parent == NULL) 
+               /* No need to acquire root->share_lock */
+               cki_createrootrate(icls, ckid.rootsectorate);
+       
+       
+       try_module_get(THIS_MODULE);
+       return icls;
+}
+
+static void cki_free(void *res)
+{
+       cki_icls_t *icls = res, *parres, *childres;
+       struct ckrm_core_class *child = NULL;
+       int maxlimit, resid = cki_rcbs.resid;
+
+       
+       if (!res)
+               return;
+
+       /* Deallocate CFQ queues */
+
+       /* Currently CFQ queues are deallocated when empty. Since no task 
+        * should belong to this icls, no new requests will get added to the
+        * CFQ queue. 
+        * 
+        * When CFQ switches to persistent queues, call its "put" function
+        * so it gets deallocated after the last pending request is serviced.
+        *
+        */
+
+       parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t);
+       if (!parres) {
+               printk(KERN_ERR "cki_free: error getting "
+                      "resclass from core \n");
+               return;
+       }
+
+       /* Update parent's shares */
+       down_write(&parres->sem);
+
+       child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0);
+       parres->cnt_unused += icls->cnt_guarantee;
+
+       // run thru parent's children and get the new max_limit of the parent
+       ckrm_lock_hier(parres->core);
+       maxlimit = 0;
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, resid, cki_icls_t);
+               if (maxlimit < childres->shares.my_limit) {
+                       maxlimit = childres->shares.my_limit;
+               }
+       }
+       ckrm_unlock_hier(parres->core);
+       if (parres->shares.cur_max_limit < maxlimit) {
+               parres->shares.cur_max_limit = maxlimit;
+       }
+       up_write(&parres->sem);
+
+       /* Drop refcounts on all psq's corresponding to this class */
+       cki_put_psq(icls);
+       
+       cki_rates_del(icls);
+
+       kfree(res);
+       module_put(THIS_MODULE);
+       return;
+}
+
+
+/* Recalculate absolute shares from relative
+ * Caller should hold a lock on icls
+ */
+
+static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
+{
+
+       struct ckrm_core_class *child = NULL;
+       cki_icls_t *childres;
+       int resid = cki_rcbs.resid;
+       u64 temp;
+
+       if (parres) {
+               struct ckrm_shares *par = &parres->shares;
+               struct ckrm_shares *self = &res->shares;
+
+
+               if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
+                       res->cnt_guarantee = CKRM_SHARE_DONTCARE;
+               } else if (par->total_guarantee) {
+                       temp = (u64) self->my_guarantee * 
+                               parres->cnt_guarantee;
+                       do_div(temp, par->total_guarantee);
+                       res->cnt_guarantee = (int) temp;
+               } else {
+                       res->cnt_guarantee = 0;
+               }
+
+
+               if (parres->cnt_limit == CKRM_SHARE_DONTCARE) {
+                       res->cnt_limit = CKRM_SHARE_DONTCARE;
+                       cki_set_sectorate(res,ckid.minsectorate);
+               } else {
+                       if (par->max_limit) {
+                               temp = (u64) self->my_limit * 
+                                       parres->cnt_limit;
+                               do_div(temp, par->max_limit);
+                               res->cnt_limit = (int) temp;
+                       } else {
+                               res->cnt_limit = 0;
+                       }
+                       cki_set_sectorate(res,res->cnt_limit);
+               }
+               
+               if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) {
+                       res->cnt_unused = CKRM_SHARE_DONTCARE;
+               } else {
+                       if (self->total_guarantee) {
+                               temp = (u64) self->unused_guarantee * 
+                                       res->cnt_guarantee;
+                               do_div(temp, self->total_guarantee);
+                               res->cnt_unused = (int) temp;
+                       } else {
+                               res->cnt_unused = 0;
+                       }
+
+               }
+               
+       }
+       // propagate to children
+       ckrm_lock_hier(res->core);
+       while ((child = ckrm_get_next_child(res->core,child)) != NULL){
+               childres = ckrm_get_res_class(child, resid, 
+                                             cki_icls_t);
+               
+               down_write(&childres->sem);
+               cki_recalc_propagate(childres, res);
+               up_write(&childres->sem);
+       }
+       ckrm_unlock_hier(res->core);
+}
+
+
+static int cki_setshare(void *res, struct ckrm_shares *new)
+{
+       cki_icls_t *icls = res, *parres;
+       struct ckrm_shares *cur, *par;
+       int rc = -EINVAL, resid = cki_rcbs.resid;
+
+       if (!icls) 
+               return rc;
+
+       cur = &icls->shares; 
+       if (icls->parent) {
+               parres =
+                   ckrm_get_res_class(icls->parent, resid, cki_icls_t);
+               if (!parres) {
+                       pr_debug("cki_setshare: invalid resclass\n");
+                       return -EINVAL;
+               }
+               down_write(&parres->sem);
+               down_write(&icls->sem);
+               par = &parres->shares;
+       } else {
+               down_write(&icls->sem);
+               parres = NULL;
+               par = NULL;
+       }
+
+       rc = set_shares(new, cur, par);
+
+       if ((!rc) && parres) {
+               if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
+                       parres->cnt_unused = CKRM_SHARE_DONTCARE;
+               } else if (par->total_guarantee) {
+                       u64 temp = (u64) par->unused_guarantee * 
+                               parres->cnt_guarantee;
+                       do_div(temp, par->total_guarantee);
+                       parres->cnt_unused = (int) temp;
+               } else {
+                       parres->cnt_unused = 0;
+               }
+               cki_recalc_propagate(res, parres);
+       }
+       up_write(&icls->sem);
+       if (icls->parent) {
+               up_write(&parres->sem);
+       }
+       return rc;
+}
+
+static int cki_getshare(void *res, struct ckrm_shares * shares)
+{
+       cki_icls_t *icls = res;
+
+       if (!icls)
+               return -EINVAL;
+       *shares = icls->shares;
+       return 0;
+}
+
+static int cki_getstats(void *res, struct seq_file *sfile)
+{
+       cki_icls_t *icls = res;
+       struct psdrate *prate;
+       char *path;
+               
+
+       if (!icls)
+               return -EINVAL;
+
+       seq_printf(sfile, "res=%s, abs limit %d\n",cki_rcbs.res_name,
+                  icls->cnt_limit);
+
+       down_read(&icls->sem);
+       list_for_each_entry(prate, &icls->rate_list, rate_list) {
+               path = kobject_get_path(&prate->psd->queue->kobj, GFP_KERNEL);
+               seq_printf(sfile,"%s skip %d timdout %d avsec %lu rate %d"
+                          " sec0 %lu sec1 %lu\n",
+                          path,
+                          prate->psrate.nskip,
+                          prate->psrate.timedout,
+                          prate->psrate.navsec,
+                          atomic_read(&(prate->psrate.sectorate)),
+                          (unsigned long)prate->psrate.sec[0],
+                          (unsigned long)prate->psrate.sec[1]);
+               kfree(path);
+       }
+       up_read(&icls->sem);
+       return 0;
+}
+
+static int cki_resetstats(void *res)
+{
+       cki_icls_t *icls = res;
+
+       if (!res)
+               return -EINVAL;
+       
+       init_icls_stats(icls);
+       return 0;
+}
+
+static void cki_chgcls(void *tsk, void *oldres, void *newres)
+{
+       /* cki_icls_t *oldicls = oldres, *newicls = newres; */
+       
+       /* Nothing needs to be done 
+        * Future requests from task will go to the new class's psq
+        * Old ones will continue to get satisfied from the original psq
+        * 
+        */
+       return;
+}
+
+enum iocfg_token_t {
+       ROOTRATE, MINRATE, IOCFGERR
+};
+
+/* Token matching for parsing input to this magic file */
+static match_table_t iocfg_tokens = {
+       {ROOTRATE, "rootsectorate=%d"},
+       {MINRATE,"minsectorate=%d"},
+       {IOCFGERR, NULL}
+};
+
+static int cki_recalc_abs(void)
+{
+       struct ckrm_core_class *root;
+       cki_icls_t *icls;
+
+       root = (cki_rcbs.classtype)->default_class;
+       icls = ckrm_get_res_class(root, cki_rcbs.resid, cki_icls_t);
+       if (!icls)
+               return -EINVAL;
+
+       down_write(&icls->sem);
+       cki_recalc_propagate(icls, NULL);
+       up_write(&icls->sem);
+       return 0;
+}
+
+       
+
+
+static int cki_showconfig(void *res, struct seq_file *sfile)
+{
+       cki_icls_t *icls = res;
+       struct cki_data tmp;
+
+       if (!icls)
+               return -EINVAL;
+
+       spin_lock(&ckid.cfglock);
+       tmp = ckid;
+       spin_unlock(&ckid.cfglock);
+
+       seq_printf(sfile, "rootsectorate = %d, minsectorate = %d\n",
+                  tmp.rootsectorate,
+                  tmp.minsectorate);
+       return 0;
+}
+       
+static int cki_setconfig(void *res, const char *cfgstr)
+{
+       char *p, *inpstr = cfgstr;
+       int tmp,rc = -EINVAL;
+       cki_icls_t *rooticls;
+
+
+       if (!cfgstr)
+               return -EINVAL;
+       
+       while ((p = strsep(&inpstr, ",")) != NULL) {
+
+               substring_t args[MAX_OPT_ARGS];
+               int token;
+
+               
+               if (!*p)
+                       continue;
+               
+               token = match_token(p, iocfg_tokens, args);
+               switch (token) {
+
+               case ROOTRATE: 
+                       if (match_int(args, &tmp))
+                               return -EINVAL;
+
+                       if (tmp < 0)
+                               return -EINVAL;
+
+                       spin_lock(&(ckid.cfglock));
+                       ckid.rootsectorate = tmp;
+                       spin_unlock(&(ckid.cfglock));
+                       
+                       rooticls = ckrm_get_res_class(
+                               (cki_rcbs.classtype)->default_class, 
+                               cki_rcbs.resid, cki_icls_t);
+
+                       cki_setrootrate(rooticls,tmp);
+                       /* update absolute shares treewide */
+                       rc = cki_recalc_abs();
+                       if (rc)
+                               return rc;
+                       break;
+
+               case MINRATE:
+                       if (match_int(args, &tmp))
+                               return -EINVAL;
+
+                       spin_lock(&(ckid.cfglock));
+                       if (tmp <= 0 || tmp > ckid.rootsectorate) {
+                               spin_unlock(&(ckid.cfglock));
+                               return -EINVAL;
+                       }
+                       ckid.minsectorate = tmp;
+                       spin_unlock(&(ckid.cfglock));
+                       
+                       /* update absolute shares treewide */
+                       rc = cki_recalc_abs();
+                       if (rc)
+                               return rc;
+                       break;
+
+               default:
+                       return -EINVAL;
+
+               }
+       }
+
+       return rc;
+}
+
+
+
+
+
+struct ckrm_res_ctlr cki_rcbs = {
+       .res_name = "io",
+       .res_hdepth = 1,
+       .resid = -1,
+       .res_alloc = cki_alloc,
+       .res_free = cki_free,
+       .set_share_values = cki_setshare,
+       .get_share_values = cki_getshare,
+       .get_stats = cki_getstats,
+       .reset_stats = cki_resetstats,
+       .show_config = cki_showconfig,
+       .set_config = cki_setconfig,
+       .change_resclass = cki_chgcls,
+};
+
+
+void __exit cki_exit(void)
+{
+       ckrm_unregister_res_ctlr(&cki_rcbs);
+       cki_rcbs.resid = -1;
+       cki_rcbs.classtype = NULL; 
+}
+
+int __init cki_init(void)
+{
+       struct ckrm_classtype *clstype;
+       int resid = cki_rcbs.resid;
+
+       if (resid != -1) 
+               return 0;
+
+       clstype = ckrm_find_classtype_by_name("taskclass");
+       if (clstype == NULL) {
+               printk(KERN_WARNING "%s: classtype<taskclass> not found\n",
+                      __FUNCTION__);
+               return -ENOENT;
+       }
+
+       ckid.cfglock = SPIN_LOCK_UNLOCKED;
+       ckid.rootsectorate = CKI_ROOTSECTORATE_DEF;
+       ckid.minsectorate = CKI_MINSECTORATE_DEF;
+
+       atomic_set(&cki_def_psrate.sectorate,0);
+       init_rwsem(&psdlistsem);
+       
+       resid = ckrm_register_res_ctlr(clstype, &cki_rcbs);
+       if (resid == -1) 
+               return -ENOENT;
+
+       cki_rcbs.classtype = clstype;
+       return 0;
+}
+       
+
+module_init(cki_init)
+module_exit(cki_exit)
+
+MODULE_AUTHOR("Shailabh Nagar <[EMAIL PROTECTED]>");
+MODULE_DESCRIPTION("CKRM Disk I/O Resource Controller");
+MODULE_LICENSE("GPL");
+
Index: linux-2.6.12-rc3/drivers/block/ps-iosched.c
===================================================================
--- linux-2.6.12-rc3.orig/drivers/block/ps-iosched.c
+++ linux-2.6.12-rc3/drivers/block/ps-iosched.c
@@ -22,7 +22,8 @@
 #include <linux/compiler.h>
 #include <linux/hash.h>
 #include <linux/rbtree.h>
-#include <linux/mempool.h>
+#include <linux/ckrm-io.h>
+#include <asm/div64.h>
 
 static unsigned long max_elapsed_prq;
 static unsigned long max_elapsed_dispatch;
@@ -39,6 +40,10 @@ static int ps_fifo_rate = HZ / 8;    /* fif
 static int ps_back_max = 16 * 1024;    /* maximum backwards seek, in KiB */
 static int ps_back_penalty = 2;        /* penalty of a backwards seek */
 
+#define PS_EPOCH               1000000000
+#define PS_HMAX_PCT            80
+
+
 /*
  * for the hash of psq inside the psd
  */
@@ -90,53 +95,20 @@ enum {
        PS_KEY_TGID,
        PS_KEY_UID,
        PS_KEY_GID,
+       PS_KEY_TASKCLASS,
        PS_KEY_LAST,
 };
 
-static char *ps_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
+
+
+static char *ps_key_types[] = { "pgid", "tgid", "uid", "gid", "taskclass", NULL };
 
 static kmem_cache_t *prq_pool;
 static kmem_cache_t *ps_pool;
 static kmem_cache_t *ps_ioc_pool;
 
-struct ps_data {
-       struct list_head rr_list;
-       struct list_head empty_list;
-
-       struct hlist_head *ps_hash;
-       struct hlist_head *prq_hash;
-
-       /* queues on rr_list (ie they have pending requests */
-       unsigned int busy_queues;
-
-       unsigned int max_queued;
-
-       atomic_t ref;
-
-       int key_type;
-
-       mempool_t *prq_pool;
-
-       request_queue_t *queue;
-
-       sector_t last_sector;
-
-       int rq_in_driver;
-
-       /*
-        * tunables, see top of file
-        */
-       unsigned int ps_quantum;
-       unsigned int ps_queued;
-       unsigned int ps_fifo_expire_r;
-       unsigned int ps_fifo_expire_w;
-       unsigned int ps_fifo_batch_expire;
-       unsigned int ps_back_penalty;
-       unsigned int ps_back_max;
-       unsigned int find_best_prq;
-
-       unsigned int ps_tagged;
-};
+extern struct rw_semaphore psdlistsem;
+extern struct list_head ps_psdlist;
 
 struct ps_queue {
        /* reference count */
@@ -175,6 +147,22 @@ struct ps_queue {
        int in_flight;
        /* number of currently allocated requests */
        int alloc_limit[2];
+
+       /* limit related settings/stats */
+       struct ps_rate *psrate; 
+
+       u64 epstart;            /* current epoch's starting timestamp (ns) */
+       u64 epsector[2];        /* Total sectors dispatched in [0] previous
+                                * and [1] current epoch
+                                */
+       unsigned long avsec;    /* avg sectors dispatched/epoch */
+       int skipped;            /* queue skipped at last dispatch ? */
+
+       /* Per queue timer to suspend/resume queue from processing */
+       struct timer_list timer;
+       unsigned long wait_end;
+       unsigned long flags;
+       struct work_struct work;
 };
 
 struct ps_rq {
@@ -200,6 +188,7 @@ static void ps_dispatch_sort(request_que
 static void ps_update_next_prq(struct ps_rq *);
 static void ps_put_psd(struct ps_data *psd);
 
+
 /*
  * what the fairness is based on (ie how processes are grouped and
  * differentiated)
@@ -220,6 +209,8 @@ ps_hash_key(struct ps_data *psd, struct 
                        return tsk->uid;
                case PS_KEY_GID:
                        return tsk->gid;
+               case PS_KEY_TASKCLASS:
+                       return (unsigned long) class_core(tsk->taskclass);
        }
 }
 
@@ -722,6 +713,81 @@ ps_merged_requests(request_queue_t *q, s
        ps_remove_request(q, next);
 }
 
+
+/* Over how many ns is sectorate defined */
+#define NS4SCALE  (100000000)
+
+struct ps_rq *dbprq;
+struct ps_queue *dbpsq;
+unsigned long dbsectorate;
+
+static void __ps_check_limit(struct ps_data *psd,struct ps_queue *psq, int dontskip)
+{
+       struct ps_rq *prq;
+       unsigned long long ts, gap, epoch, tmp;
+       unsigned long newavsec, sectorate;
+
+       prq = rb_entry_prq(rb_first(&psq->sort_list));
+
+       dbprq = prq;
+       dbpsq = psq;
+
+       ts = sched_clock();
+       gap = ts - psq->epstart;
+       epoch = psd->ps_epoch;
+
+       sectorate = atomic_read(&psq->psrate->sectorate);
+       dbsectorate = sectorate;
+
+       if ((gap >= epoch) || ((s64)gap < 0)) {
+
+               if (gap >= (epoch << 1)) {
+                       psq->epsector[0] = 0;
+                       psq->epstart = ts ; 
+               } else {
+                       psq->epsector[0] = psq->epsector[1];
+                       psq->epstart += epoch;
+               } 
+               psq->epsector[1] = 0;
+               gap = ts - psq->epstart;
+
+               tmp  = (psq->epsector[0] + prq->request->nr_sectors) * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               psq->avsec = (unsigned long)tmp;
+               psq->skipped = 0;
+               psq->epsector[1] += prq->request->nr_sectors;
+               
+               psq->psrate->navsec = psq->avsec;
+               psq->psrate->sec[0] = psq->epsector[0];
+               psq->psrate->sec[1] = psq->epsector[1];
+               psq->psrate->timedout++;
+               return;
+       } else {
+               
+               tmp = (psq->epsector[0] + psq->epsector[1] + 
+                      prq->request->nr_sectors) * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               newavsec = (unsigned long)tmp;
+               if ((newavsec < sectorate) || dontskip) {
+                       psq->avsec = newavsec ;
+                       psq->skipped = 0;
+                       psq->epsector[1] += prq->request->nr_sectors;
+                       psq->psrate->navsec = psq->avsec;
+                       psq->psrate->sec[1] = psq->epsector[1];
+               } else {
+                       psq->skipped = 1;
+                       /* pause q's processing till avsec drops to 
+                          ps_hmax_pct % of its value */
+                       tmp = (epoch+gap) * (100-psd->ps_hmax_pct);
+                       do_div(tmp,1000000*psd->ps_hmax_pct);
+                       psq->wait_end = jiffies+msecs_to_jiffies(tmp);
+               }
+       }       
+}
+
+
 /*
  * we dispatch psd->ps_quantum requests in total from the rr_list queues,
  * this function sector sorts the selected request to minimize seeks. we start
@@ -823,7 +889,7 @@ static int ps_dispatch_requests(request_
        struct ps_data *psd = q->elevator->elevator_data;
        struct ps_queue *psq;
        struct list_head *entry, *tmp;
-       int queued, busy_queues, first_round;
+       int queued, busy_queues, first_round, busy_unlimited;
 
        if (list_empty(&psd->rr_list))
                return 0;
@@ -831,24 +897,36 @@ static int ps_dispatch_requests(request_
        queued = 0;
        first_round = 1;
 restart:
+       busy_unlimited = 0;
        busy_queues = 0;
        list_for_each_safe(entry, tmp, &psd->rr_list) {
                psq = list_entry_psq(entry);
 
                BUG_ON(RB_EMPTY(&psq->sort_list));
+               busy_queues++;
+               
+               if (first_round || busy_unlimited)
+                       __ps_check_limit(psd,psq,0);
+               else
+                       __ps_check_limit(psd,psq,1);
 
-               /*
-                * first round of queueing, only select from queues that
-                * don't already have io in-flight
-                */
-               if (first_round && psq->in_flight)
+               if (psq->skipped) {
+                       psq->psrate->nskip++;
+                       busy_queues--;
+                       if (time_before(jiffies, psq->wait_end)) {
+                               list_del(&psq->ps_list);
+                               mod_timer(&psq->timer,psq->wait_end);
+                       }
                        continue;
+               }
+               busy_unlimited++;
 
                ps_dispatch_request(q, psd, psq);
 
-               if (!RB_EMPTY(&psq->sort_list))
-                       busy_queues++;
-
+               if (RB_EMPTY(&psq->sort_list)) {
+                       busy_unlimited--;
+                       busy_queues--;
+               }
                queued++;
        }
 
@@ -856,6 +934,19 @@ restart:
                first_round = 0;
                goto restart;
        }
+#if 0
+       } else {
+               /*
+                * if we hit the queue limit, put the string of serviced
+                * queues at the back of the pending list
+                */
+               struct list_head *prv = nxt->prev;
+               if (prv != plist) {
+                       list_del(plist);
+                       list_add(plist, prv);
+               }
+       }
+#endif
 
        return queued;
 }
@@ -961,6 +1052,25 @@ dispatch:
        return NULL;
 }
 
+void ps_set_sectorate(struct ckrm_core_class *core, int sectorate)
+{
+       struct ps_data *psd;
+       struct ps_queue *psq;
+       u64 temp;
+
+       down_read(&psdlistsem);
+       list_for_each_entry(psd, &ps_psdlist, psdlist) {
+               psq = ps_find_ps_hash(psd,(unsigned long)core);
+               
+               temp = (u64) sectorate * psd->ps_max_sectorate;
+               do_div(temp,ckid.rootsectorate);
+
+               atomic_set(&psq->psrate->sectorate, temp);
+       }
+       up_read(&psdlistsem);
+}
+
+
 /*
  * task holds one reference to the queue, dropped when task exits. each prq
  * in-flight on this queue also holds a reference, dropped when prq is freed.
@@ -1186,6 +1296,29 @@ err:
        return NULL;
 }
 
+
+static void ps_pauseq_timer(unsigned long data)
+{
+       struct ps_queue *psq = (struct ps_queue *) data;
+       kblockd_schedule_work(&psq->work);
+}
+
+static void ps_pauseq_work(void *data)
+{
+       struct ps_queue *psq = (struct ps_queue *) data;
+       struct ps_data *psd = psq->psd;
+       request_queue_t *q = psd->queue;
+       unsigned long flags;
+       
+       spin_lock_irqsave(q->queue_lock, flags);
+       list_add_tail(&psq->ps_list,&psd->rr_list);
+       psq->skipped = 0;
+       if (ps_next_request(q))
+               q->request_fn(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}      
+
+
 static struct ps_queue *
 __ps_get_queue(struct ps_data *psd, unsigned long key, int gfp_mask)
 {
@@ -1215,9 +1348,25 @@ retry:
                INIT_LIST_HEAD(&psq->fifo[0]);
                INIT_LIST_HEAD(&psq->fifo[1]);
 
+               psq->psrate = cki_tsk_psrate(psd,current);
+               if (!psq->psrate) {
+                   printk(KERN_WARNING "%s: psrate not found\n",__FUNCTION__);
+                   psq->psrate = &cki_def_psrate;
+               }
+
+               psq->epstart = sched_clock();
+               init_timer(&psq->timer);
+               psq->timer.function = ps_pauseq_timer;
+               psq->timer.data = (unsigned long) psq;
+               INIT_WORK(&psq->work, ps_pauseq_work, psq); 
+
+
                psq->key = key;
                hlist_add_head(&psq->ps_hash, &psd->ps_hash[hashval]);
-               atomic_set(&psq->ref, 0);
+               /* Refcount set to one to account for the CKRM class 
+                *  corresponding to this queue. 
+                */
+               atomic_set(&psq->ref, 1);
                psq->psd = psd;
                atomic_inc(&psd->ref);
                psq->key_type = psd->key_type;
@@ -1227,6 +1376,7 @@ retry:
        if (new_psq)
                kmem_cache_free(ps_pool, new_psq);
 
+       /* incr ref count for each request using the psq */
        atomic_inc(&psq->ref);
 out:
        WARN_ON((gfp_mask & __GFP_WAIT) && !psq);
@@ -1472,6 +1622,7 @@ out_lock:
        return 1;
 }
 
+
 static void ps_put_psd(struct ps_data *psd)
 {
        request_queue_t *q = psd->queue;
@@ -1479,6 +1630,7 @@ static void ps_put_psd(struct ps_data *p
        if (!atomic_dec_and_test(&psd->ref))
                return;
 
+       cki_psd_del(psd);
        blk_put_queue(q);
 
        mempool_destroy(psd->prq_pool);
@@ -1495,27 +1647,42 @@ static void ps_exit_queue(elevator_t *e)
 static int ps_init_queue(request_queue_t *q, elevator_t *e)
 {
        struct ps_data *psd;
-       int i;
+       struct psd_list_entry *psdl;
+       int i,rc;
 
        psd = kmalloc(sizeof(*psd), GFP_KERNEL);
        if (!psd)
                return -ENOMEM;
 
+       psdl = kmalloc(sizeof(*psdl), GFP_KERNEL);
+       if (!psdl)
+               goto out_psd;
+       INIT_LIST_HEAD(&psdl->psd_list);
+       psdl->psd = psd;
+
        memset(psd, 0, sizeof(*psd));
        INIT_LIST_HEAD(&psd->rr_list);
        INIT_LIST_HEAD(&psd->empty_list);
 
-       psd->prq_hash = kmalloc(sizeof(struct hlist_head) * PS_MHASH_ENTRIES, GFP_KERNEL);
+       rc = cki_psd_init(psd);
+       if (rc)
+               goto out_psdl;
+
+
+       psd->prq_hash = kmalloc(sizeof(struct hlist_head) * PS_MHASH_ENTRIES, 
+                               GFP_KERNEL);
        if (!psd->prq_hash)
-               goto out_prqhash;
+               goto out_psdl;
 
-       psd->ps_hash = kmalloc(sizeof(struct hlist_head) * PS_QHASH_ENTRIES, GFP_KERNEL);
+       psd->ps_hash = kmalloc(sizeof(struct hlist_head) * PS_QHASH_ENTRIES, 
+                              GFP_KERNEL);
        if (!psd->ps_hash)
-               goto out_pshash;
+               goto out_prqhash;
 
-       psd->prq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, prq_pool);
+       psd->prq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, 
+                                      mempool_free_slab, prq_pool);
        if (!psd->prq_pool)
-               goto out_prqpool;
+               goto out_pshash;
 
        for (i = 0; i < PS_MHASH_ENTRIES; i++)
                INIT_HLIST_HEAD(&psd->prq_hash[i]);
@@ -1527,6 +1694,10 @@ static int ps_init_queue(request_queue_t
        psd->queue = q;
        atomic_inc(&q->refcnt);
 
+       down_write(&psdlistsem);
+       list_add(&psdl->psd_list,&ps_psdlist);
+       up_write(&psdlistsem);
+
        /*
         * just set it to some high value, we want anyone to be able to queue
         * some requests. fairness is handled differently
@@ -1546,12 +1717,18 @@ static int ps_init_queue(request_queue_t
        psd->ps_back_max = ps_back_max;
        psd->ps_back_penalty = ps_back_penalty;
 
+       psd->ps_epoch = PS_EPOCH;
+       psd->ps_hmax_pct = PS_HMAX_PCT;
+
+
        return 0;
-out_prqpool:
-       kfree(psd->ps_hash);
 out_pshash:
-       kfree(psd->prq_hash);
+       kfree(psd->ps_hash);
 out_prqhash:
+       kfree(psd->prq_hash);
+out_psdl:
+       kfree(psdl);
+out_psd:
        kfree(psd);
        return -ENOMEM;
 }
@@ -1589,6 +1766,17 @@ fail:
        return -ENOMEM;
 }
 
+/* Exported functions */
+int ps_drop_psq(struct ps_data *psd, unsigned long key)
+{
+       struct ps_queue *psq = ps_find_ps_hash(psd, key);
+       if (!psq)
+               return -1;
+
+       ps_put_queue(psq);
+       return 0;
+}
+EXPORT_SYMBOL(ps_drop_psq);
 
 /*
  * sysfs parts below -->
@@ -1633,6 +1821,8 @@ ps_set_key_type(struct ps_data *psd, con
                psd->key_type = PS_KEY_UID;
        else if (!strncmp(page, "gid", 3))
                psd->key_type = PS_KEY_GID;
+       else if (!strncmp(page, "taskclass", 9))
+               psd->key_type = PS_KEY_TASKCLASS;
        spin_unlock_irq(psd->queue->queue_lock);
        return count;
 }
@@ -1654,7 +1844,7 @@ ps_read_key_type(struct ps_data *psd, ch
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
-static ssize_t __FUNC(struct ps_data *psd, char *page)         \
+static ssize_t __FUNC(struct ps_data *psd, char *page)                 \
 {                                                                      \
        unsigned int __data = __VAR;                                    \
        if (__CONV)                                                     \
@@ -1669,6 +1859,10 @@ SHOW_FUNCTION(ps_fifo_batch_expire_show,
 SHOW_FUNCTION(ps_find_best_show, psd->find_best_prq, 0);
 SHOW_FUNCTION(ps_back_max_show, psd->ps_back_max, 0);
 SHOW_FUNCTION(ps_back_penalty_show, psd->ps_back_penalty, 0);
+SHOW_FUNCTION(ps_epoch_show, psd->ps_epoch,0);
+SHOW_FUNCTION(ps_hmax_pct_show, psd->ps_hmax_pct,0);
+SHOW_FUNCTION(ps_max_sectorate_show, psd->ps_max_sectorate,0);
+SHOW_FUNCTION(ps_min_sectorate_show, psd->ps_min_sectorate,0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
@@ -1694,6 +1888,10 @@ STORE_FUNCTION(ps_fifo_batch_expire_stor
 STORE_FUNCTION(ps_find_best_store, &psd->find_best_prq, 0, 1, 0);
 STORE_FUNCTION(ps_back_max_store, &psd->ps_back_max, 0, UINT_MAX, 0);
 STORE_FUNCTION(ps_back_penalty_store, &psd->ps_back_penalty, 1, UINT_MAX, 0);
+STORE_FUNCTION(ps_epoch_store, &psd->ps_epoch, 0, INT_MAX,0);
+STORE_FUNCTION(ps_hmax_pct_store, &psd->ps_hmax_pct, 1, 100,0);
+STORE_FUNCTION(ps_max_sectorate_store, &psd->ps_max_sectorate, 0, INT_MAX,0);
+STORE_FUNCTION(ps_min_sectorate_store, &psd->ps_min_sectorate, 0, INT_MAX,0);
 #undef STORE_FUNCTION
 
 static struct ps_fs_entry ps_quantum_entry = {
@@ -1745,6 +1943,27 @@ static struct ps_fs_entry ps_key_type_en
        .show = ps_read_key_type,
        .store = ps_set_key_type,
 };
+static struct ps_fs_entry ps_epoch_entry = {
+       .attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR },
+       .show = ps_epoch_show,
+       .store = ps_epoch_store,
+};
+static struct ps_fs_entry ps_hmax_pct_entry = {
+       .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR },
+       .show = ps_hmax_pct_show,
+       .store = ps_hmax_pct_store,
+};
+static struct ps_fs_entry ps_max_sectorate_entry = {
+       .attr = {.name = "max_sectorate", .mode = S_IRUGO | S_IWUSR },
+       .show = ps_max_sectorate_show,
+       .store = ps_max_sectorate_store,
+};
+static struct ps_fs_entry ps_min_sectorate_entry = {
+       .attr = {.name = "min_sectorate", .mode = S_IRUGO | S_IWUSR },
+       .show = ps_min_sectorate_show,
+       .store = ps_min_sectorate_store,
+};
+
 
 static struct attribute *default_attrs[] = {
        &ps_quantum_entry.attr,
@@ -1757,6 +1976,10 @@ static struct attribute *default_attrs[]
        &ps_back_max_entry.attr,
        &ps_back_penalty_entry.attr,
        &ps_clear_elapsed_entry.attr,
+       &ps_epoch_entry.attr,
+       &ps_hmax_pct_entry.attr,
+       &ps_max_sectorate_entry.attr,
+       &ps_min_sectorate_entry.attr,
        NULL,
 };
 
Index: linux-2.6.12-rc3/include/linux/ckrm-io.h
===================================================================
--- /dev/null
+++ linux-2.6.12-rc3/include/linux/ckrm-io.h
@@ -0,0 +1,134 @@
+#ifndef _LINUX_CKRM_IO_H
+#define _LINUX_CKRM_IO_H
+
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/ckrm_rc.h>
+#include <linux/ckrm_tc.h>
+
+
+/* root's default sectorate value which
+ * also serves as base for absolute shares.
+ * Configurable through taskclass' config file. 
+ */
+struct cki_data {
+       /* Protects both */
+       spinlock_t cfglock; 
+       /* root's absolute shares serve as base for other classes */
+       int rootsectorate;
+       /* absolute share assigned when relative share is "don't care" */ 
+       int minsectorate;
+};
+
+
+struct ps_data {
+       struct list_head rr_list;
+       struct list_head empty_list;
+
+       struct hlist_head *ps_hash;
+       struct hlist_head *prq_hash;
+
+       struct list_head psdlist;
+
+
+
+       /* queues on rr_list (ie they have pending requests */
+       unsigned int busy_queues;
+
+       unsigned int max_queued;
+
+       atomic_t ref;
+
+       int key_type;
+
+       mempool_t *prq_pool;
+
+       request_queue_t *queue;
+
+       sector_t last_sector;
+
+       int rq_in_driver;
+
+       /*
+        * tunables, see top of file
+        */
+       unsigned int ps_quantum;
+       unsigned int ps_queued;
+       unsigned int ps_fifo_expire_r;
+       unsigned int ps_fifo_expire_w;
+       unsigned int ps_fifo_batch_expire;
+       unsigned int ps_back_penalty;
+       unsigned int ps_back_max;
+       unsigned int find_best_prq;
+
+       unsigned int ps_tagged;
+
+       /* duration over which sectorates enforced */
+       unsigned int ps_epoch;
+       /* low-water mark (%) for resuming service of overshare ps_queues */
+       unsigned int ps_hmax_pct;
+       /* total sectors that queue can sustain */
+       unsigned int ps_max_sectorate; 
+       /* absolute sectorate when share is a "dontcare" */
+       unsigned int ps_min_sectorate;
+
+};
+
+/* For linking all psd's of ps-iosched */
+struct psd_list_entry {
+       struct list_head psd_list;
+       struct ps_data *psd;
+};
+
+/* Data for regulating sectors served */
+struct ps_rate {
+       int nskip;
+       unsigned long navsec;
+       int timedout;
+       atomic_t sectorate;
+       u64 sec[2];
+};
+
+/* To maintain psrate data structs for each
+   request queue managed by ps-iosched */
+
+struct psdrate {
+    struct list_head rate_list;
+    struct ps_data *psd;
+    struct ps_rate psrate;
+};
+
+extern struct ckrm_res_ctlr cki_rcbs;
+extern struct cki_data ckid;
+extern struct ps_rate cki_def_psrate; 
+
+extern struct rw_semaphore psdlistsem;
+extern struct list_head ps_psdlist;
+
+
+
+int cki_psd_init(struct ps_data *);
+int cki_psd_del(struct ps_data *);
+struct ps_rate *cki_tsk_psrate(struct ps_data *, struct task_struct *); 
+
+
+
+#if 0
+typedef void *(*icls_tsk_t) (struct task_struct *tsk);
+typedef int (*icls_ioprio_t) (struct task_struct *tsk);
+
+
+#ifdef CONFIG_CKRM_RES_BLKIO
+
+extern void *cki_tsk_icls (struct task_struct *tsk);
+extern int cki_tsk_ioprio (struct task_struct *tsk);
+extern void *cki_tsk_cfqpriv (struct task_struct *tsk);
+
+#endif /* CONFIG_CKRM_RES_BLKIO */
+
+#endif
+
+
+#endif 
Index: linux-2.6.12-rc3/include/linux/proc_fs.h
===================================================================
--- linux-2.6.12-rc3.orig/include/linux/proc_fs.h
+++ linux-2.6.12-rc3/include/linux/proc_fs.h
@@ -93,6 +93,7 @@ struct dentry *proc_pid_lookup(struct in
 struct dentry *proc_pid_unhash(struct task_struct *p);
 void proc_pid_flush(struct dentry *proc_dentry);
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
+int proc_pid_delay(struct task_struct *task, char * buffer);
 unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
 char *task_mem(struct mm_struct *, char *);
Index: linux-2.6.12-rc3/init/Kconfig
===================================================================
--- linux-2.6.12-rc3.orig/init/Kconfig
+++ linux-2.6.12-rc3/init/Kconfig
@@ -182,6 +182,19 @@ config CKRM_TYPE_TASKCLASS
        
          Say Y if unsure
 
+config CKRM_RES_BLKIO
+       tristate "Disk I/O Resource Controller"
+       depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ
+       default m
+       help
+         Provides a resource controller for best-effort block I/O 
+         bandwidth control. The controller attempts this by proportional 
+         servicing of requests in the I/O scheduler. However, seek
+         optimizations and reordering by device drivers/disk controllers may
+         alter the actual bandwidth delivered to a class.
+       
+         Say N if unsure, Y to use the feature.
+
 config CKRM_TYPE_SOCKETCLASS
        bool "Class Manager for socket groups"
        depends on CKRM && RCFS_FS


-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7393&alloc_id=16281&op=click
_______________________________________________
ckrm-tech mailing list
https://lists.sourceforge.net/lists/listinfo/ckrm-tech

Reply via email to