Hook the io-throttle controller into the appropriate kernel functions.
Both accounting and throttling are performed by a single entry point,
cgroup_io_throttle().

Signed-off-by: Andrea Righi <[EMAIL PROTECTED]>
---
 block/blk-core.c      |    2 ++
 fs/aio.c              |   31 ++++++++++++++++++++++++++++++-
 fs/buffer.c           |   20 +++++++++++++++++---
 fs/direct-io.c        |    4 ++++
 include/linux/sched.h |    3 +++
 kernel/fork.c         |    3 +++
 mm/filemap.c          |   18 +++++++++++++++++-
 mm/page-writeback.c   |   30 +++++++++++++++++++++++++++---
 mm/readahead.c        |    5 +++++
 9 files changed, 108 insertions(+), 8 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4c222ba..bffce33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,6 +26,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/blktrace_api.h>
@@ -1483,6 +1484,7 @@ void submit_bio(int rw, struct bio *bio)
                        count_vm_events(PGPGOUT, count);
                } else {
                        task_io_account_read(bio->bi_size);
+                       cgroup_io_throttle(bio->bi_bdev, bio->bi_size, 1);
                        count_vm_events(PGPGIN, count);
                }
 
diff --git a/fs/aio.c b/fs/aio.c
index 0051fd9..1f3abb3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -22,6 +22,7 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
@@ -1558,6 +1559,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
 {
        struct kiocb *req;
        struct file *file;
+       struct block_device *bdev;
+       struct inode *inode;
        ssize_t ret;
 
        /* enforce forwards compatibility on users */
@@ -1580,10 +1583,26 @@ static int io_submit_one(struct kioctx *ctx, struct 
iocb __user *user_iocb,
        if (unlikely(!file))
                return -EBADF;
 
+       /*
+        * Pre-account AIO activity: we over-account *all* the bytes here;
+        * bytes read from the page cache and bytes written in already dirtied
+        * pages (that do not generate real i/o on block devices) will be
+        * subtracted later, following the path of aio_run_iocb().
+        */
+       inode = file->f_mapping->host;
+       bdev = inode->i_sb->s_bdev;
+       ret = cgroup_io_throttle(bdev, iocb->aio_nbytes, 0);
+       if (unlikely(ret)) {
+               fput(file);
+               ret = -EAGAIN;
+               goto out_cgroup_io_throttle;
+       }
+
        req = aio_get_req(ctx);         /* returns with 2 references to req */
        if (unlikely(!req)) {
                fput(file);
-               return -EAGAIN;
+               ret = -EAGAIN;
+               goto out_cgroup_io_throttle;
        }
        req->ki_filp = file;
        if (iocb->aio_flags & IOCB_FLAG_RESFD) {
@@ -1622,12 +1641,14 @@ static int io_submit_one(struct kioctx *ctx, struct 
iocb __user *user_iocb,
                goto out_put_req;
 
        spin_lock_irq(&ctx->ctx_lock);
+       set_in_aio();
        aio_run_iocb(req);
        if (!list_empty(&ctx->run_list)) {
                /* drain the run list */
                while (__aio_run_iocbs(ctx))
                        ;
        }
+       unset_in_aio();
        spin_unlock_irq(&ctx->ctx_lock);
        aio_put_req(req);       /* drop extra ref to req */
        return 0;
@@ -1635,6 +1656,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
 out_put_req:
        aio_put_req(req);       /* drop extra ref to req */
        aio_put_req(req);       /* drop i/o ref to req */
+out_cgroup_io_throttle:
+       cgroup_io_throttle(bdev, -iocb->aio_nbytes, 0);
        return ret;
 }
 
@@ -1746,6 +1769,12 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, 
struct iocb __user *iocb,
        ret = -EAGAIN;
        kiocb = lookup_kiocb(ctx, iocb, key);
        if (kiocb && kiocb->ki_cancel) {
+               struct block_device *bdev;
+               struct inode *inode = kiocb->ki_filp->f_mapping->host;
+
+               bdev = inode->i_sb->s_bdev;
+               cgroup_io_throttle(bdev, -kiocb->ki_nbytes, 0);
+
                cancel = kiocb->ki_cancel;
                kiocb->ki_users ++;
                kiocbSetCancelled(kiocb);
diff --git a/fs/buffer.c b/fs/buffer.c
index 4ffb5bb..89808b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -35,6 +35,7 @@
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/bio.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
@@ -708,11 +709,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
 static int __set_page_dirty(struct page *page,
                struct address_space *mapping, int warn)
 {
+       ssize_t cgroup_io_acct = 0;
+       int ret = 0;
+
        if (unlikely(!mapping))
                return !TestSetPageDirty(page);
 
        if (TestSetPageDirty(page))
-               return 0;
+               goto out;
 
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {    /* Race with truncate? */
@@ -723,14 +727,24 @@ static int __set_page_dirty(struct page *page,
                        __inc_bdi_stat(mapping->backing_dev_info,
                                        BDI_RECLAIMABLE);
                        task_io_account_write(PAGE_CACHE_SIZE);
+                       cgroup_io_acct = PAGE_CACHE_SIZE;
                }
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
        spin_unlock_irq(&mapping->tree_lock);
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-       return 1;
+       ret = 1;
+out:
+       if (is_in_aio() && !cgroup_io_acct)
+               cgroup_io_acct = -PAGE_CACHE_SIZE;
+       if (cgroup_io_acct) {
+               struct block_device *bdev = (mapping->host &&
+               mapping->host->i_sb->s_bdev) ?
+               mapping->host->i_sb->s_bdev : NULL;
+               cgroup_io_throttle(bdev, cgroup_io_acct, 0);
+       }
+       return ret;
 }
 
 /*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9606ee8..f5dcb91 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -35,6 +35,7 @@
 #include <linux/buffer_head.h>
 #include <linux/rwsem.h>
 #include <linux/uio.h>
+#include <linux/blk-io-throttle.h>
 #include <asm/atomic.h>
 
 /*
@@ -660,6 +661,9 @@ submit_page_section(struct dio *dio, struct page *page,
                /*
                 * Read accounting is performed in submit_bio()
                 */
+               struct block_device *bdev = dio->bio ?
+                                       dio->bio->bi_bdev : NULL;
+               cgroup_io_throttle(bdev, len, 1);
                task_io_account_write(len);
        }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba43675..9d4c755 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1250,6 +1250,9 @@ struct task_struct {
        u64 rchar, wchar, syscr, syscw;
 #endif
        struct task_io_accounting ioac;
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+       atomic_t in_aio;
+#endif
 #if defined(CONFIG_TASK_XACCT)
        u64 acct_rss_mem1;      /* accumulated rss usage */
        u64 acct_vm_mem1;       /* accumulated virtual memory usage */
diff --git a/kernel/fork.c b/kernel/fork.c
index aed1ff7..f8cf5da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1029,6 +1029,9 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
        task_io_accounting_init(p);
        acct_clear_integrals(p);
 
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+       atomic_set(&p->in_aio, 0);
+#endif
        p->it_virt_expires = cputime_zero;
        p->it_prof_expires = cputime_zero;
        p->it_sched_expires = 0;
diff --git a/mm/filemap.c b/mm/filemap.c
index 7567d86..bb80789 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
@@ -1011,6 +1012,7 @@ static void do_generic_file_read(struct file *filp, 
loff_t *ppos,
        pgoff_t prev_index;
        unsigned long offset;      /* offset into pagecache page */
        unsigned int prev_offset;
+       int was_page_ok = 0;
        int error;
 
        index = *ppos >> PAGE_CACHE_SHIFT;
@@ -1023,7 +1025,8 @@ static void do_generic_file_read(struct file *filp, 
loff_t *ppos,
                struct page *page;
                pgoff_t end_index;
                loff_t isize;
-               unsigned long nr, ret;
+               ssize_t nr;
+               unsigned long ret;
 
                cond_resched();
 find_page:
@@ -1051,6 +1054,8 @@ find_page:
                                                                desc, offset))
                                goto page_not_up_to_date_locked;
                        unlock_page(page);
+               } else {
+                       was_page_ok = 1;
                }
 page_ok:
                /*
@@ -1080,6 +1085,17 @@ page_ok:
                }
                nr = nr - offset;
 
+               /*
+                * De-account i/o in case of AIO read from the page cache.
+                * AIO accounting was performed in io_submit_one().
+                */
+               if (is_in_aio() && was_page_ok) {
+                       struct block_device *bdev = (inode &&
+                                               inode->i_sb->s_bdev) ?
+                                               inode->i_sb->s_bdev : NULL;
+                       cgroup_io_throttle(bdev, -nr, 0);
+               }
+
                /* If users can be writing to this page using arbitrary
                 * virtual addresses, take care about potential aliasing
                 * before reading the page on the kernel side.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 29b1d1e..c6207de 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -430,6 +431,9 @@ static void balance_dirty_pages(struct address_space 
*mapping)
        unsigned long write_chunk = sync_writeback_pages();
 
        struct backing_dev_info *bdi = mapping->backing_dev_info;
+       struct block_device *bdev = (mapping->host &&
+                                       mapping->host->i_sb->s_bdev) ?
+                                       mapping->host->i_sb->s_bdev : NULL;
 
        for (;;) {
                struct writeback_control wbc = {
@@ -512,6 +516,14 @@ static void balance_dirty_pages(struct address_space 
*mapping)
                return;         /* pdflush is already working this queue */
 
        /*
+        * Apply the cgroup i/o throttling limitations. The accounting of write
+        * activity in page cache is performed in __set_page_dirty(), but since
+        * we cannot sleep there, 0 bytes are accounted here and the function
+        * is invoked only for throttling purpose.
+        */
+       cgroup_io_throttle(bdev, 0, 1);
+
+       /*
         * In laptop mode, we wait until hitting the higher threshold before
         * starting background writeout, and then write out all the way down
         * to the lower threshold.  So slow writers cause minimal disk activity.
@@ -1074,8 +1086,11 @@ int __set_page_dirty_no_writeback(struct page *page)
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
+       struct address_space *mapping = page_mapping(page);
+       ssize_t cgroup_io_acct = 0;
+       int ret = 0;
+
        if (!TestSetPageDirty(page)) {
-               struct address_space *mapping = page_mapping(page);
                struct address_space *mapping2;
 
                if (!mapping)
@@ -1091,6 +1106,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                                __inc_bdi_stat(mapping->backing_dev_info,
                                                BDI_RECLAIMABLE);
                                task_io_account_write(PAGE_CACHE_SIZE);
+                               cgroup_io_acct = PAGE_CACHE_SIZE;
                        }
                        radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
@@ -1100,9 +1116,17 @@ int __set_page_dirty_nobuffers(struct page *page)
                        /* !PageAnon && !swapper_space */
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
                }
-               return 1;
+               ret = 1;
        }
-       return 0;
+       if (is_in_aio() && !cgroup_io_acct)
+               cgroup_io_acct = -PAGE_CACHE_SIZE;
+       if (cgroup_io_acct) {
+               struct block_device *bdev = (mapping->host &&
+                                       mapping->host->i_sb->s_bdev) ?
+                                       mapping->host->i_sb->s_bdev : NULL;
+               cgroup_io_throttle(bdev, cgroup_io_acct, 0);
+       }
+       return ret;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 137bc56..448f065 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -14,6 +14,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
 
@@ -58,6 +59,9 @@ int read_cache_pages(struct address_space *mapping, struct 
list_head *pages,
                        int (*filler)(void *, struct page *), void *data)
 {
        struct page *page;
+       struct block_device *bdev =
+               (mapping->host && mapping->host->i_sb->s_bdev) ?
+               mapping->host->i_sb->s_bdev : NULL;
        int ret = 0;
 
        while (!list_empty(pages)) {
@@ -76,6 +80,7 @@ int read_cache_pages(struct address_space *mapping, struct 
list_head *pages,
                        break;
                }
                task_io_account_read(PAGE_CACHE_SIZE);
+               cgroup_io_throttle(bdev, PAGE_CACHE_SIZE, 1);
        }
        return ret;
 }
-- 
1.5.4.3

_______________________________________________
Containers mailing list
[EMAIL PROTECTED]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel

Reply via email to