[RFC v1] add new io-scheduler to use cgroup on high-speed device
We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters. After testing different io-scheduler, we found that cfq is too slow and deadline can't run on cgroup. So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests only by using their individual weight and total weight (proportion) therefore it's simply and efficient. Test case: fusionio card, 4 cgroups, iodepth-512 groupname weight test1 1000 test2 800 test3 600 test4 400 Use tpps, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 30220 16 54 test2 28261 18 56 test3 26333 19 69 test4 20152 25 87 Use cfq, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 16478 30 242 test2 13015 39 347 test3 9300 54 371 test4 5806 87 393 Signed-off-by: Robin Dong Signed-off-by: Zhu Yanhai Cc: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Cc: Tao Ma --- block/Kconfig.iosched | 13 + block/Makefile |1 + block/tpps-iosched.c | 1272 include/linux/blkdev.h |2 +- 4 files changed, 1287 insertions(+), 1 deletions(-) create mode 100644 block/tpps-iosched.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9..e5e28c2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -21,6 +21,16 @@ config IOSCHED_DEADLINE a new point in the service tree and doing a batch of IO from there in case of expiry. +config IOSCHED_TPPS + tristate "TPPS I/O scheduler" + # If BLK_CGROUP is a module, TPPS has to be built as module. + default y + ---help--- + The TPPS I/O scheduler tries to distribute iops proportional + among all cgroups in the system. It should also provide a low + latency working environment, suitable for flash-based device. + Note: If BLK_CGROUP=m, then TPPS can be built only as module. 
+ config IOSCHED_CFQ tristate "CFQ I/O scheduler" default y @@ -49,6 +59,9 @@ choice config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y + config DEFAULT_TPPS + bool "Tiny Parallel Proportion" if IOSCHED_TPPS=y + config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y diff --git a/block/Makefile b/block/Makefile index 39b76ba..6e30ef4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_TPPS) += tpps-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY)+= blk-integrity.o diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c new file mode 100644 index 000..981fde2 --- /dev/null +++ b/block/tpps-iosched.c @@ -0,0 +1,1272 @@ +/* + * TPPS, or Tiny Parallel Proportion disk Scheduler. + * + * Based on ideas from Zhu Yanhai + * + * Copyright (C) 2013 Robin Dong + */ +#include +#include +#include +#include +#include +#include +#include +#include "blk-cgroup.h" +#include "blk.h" + +static struct kmem_cache *tpps_pool; + +struct tpps_queue { + /* reference count */ + int ref; + /* parent tpps_data */ + struct tpps_data *tppd; + /* tpps_group member */ + struct list_head tppg_node; + /* sorted list of pending requests */ + struct list_head sort_list; + struct tpps_group *tppg; + pid_t pid; + int online; + int rq_queued; +}; + +struct tppg_stats { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors 
transferred */ + struct blkg_statsectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stattime; +}; + +struct tpps_group { + struct blkg_policy_data pd; + /* tpps_data member */ + struct list_head tppd_node; + struct list_head *cur_dispatcher; + + unsigned int weight; + unsigned int new_weight; + unsigned int dev_weight; + unsigned int l
[RFC v1] add new io-scheduler to use cgroup on high-speed device
From: Robin Dong We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters. After testing different io-scheduler, we found that cfq is too slow and deadline can't run on cgroup. So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests only by using their individual weight and total weight (proportion) therefore it's simply and efficient. Test case: fusionio card, 4 cgroups, iodepth-512 groupname weight test1 1000 test2 800 test3 600 test4 400 Use tpps, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 30220 16 54 test2 28261 18 56 test3 26333 19 69 test4 20152 25 87 Use cfq, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 16478 30 242 test2 13015 39 347 test3 9300 54 371 test4 5806 87 393 Signed-off-by: Robin Dong Signed-off-by: Zhu Yanhai Cc: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Cc: Tao Ma --- block/Kconfig.iosched | 13 + block/Makefile |1 + block/tpps-iosched.c | 1272 include/linux/blkdev.h |2 +- 4 files changed, 1287 insertions(+), 1 deletions(-) create mode 100644 block/tpps-iosched.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9..e5e28c2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -21,6 +21,16 @@ config IOSCHED_DEADLINE a new point in the service tree and doing a batch of IO from there in case of expiry. +config IOSCHED_TPPS + tristate "TPPS I/O scheduler" + # If BLK_CGROUP is a module, TPPS has to be built as module. + default y + ---help--- + The TPPS I/O scheduler tries to distribute iops proportional + among all cgroups in the system. It should also provide a low + latency working environment, suitable for flash-based device. + Note: If BLK_CGROUP=m, then TPPS can be built only as module. 
+ config IOSCHED_CFQ tristate "CFQ I/O scheduler" default y @@ -49,6 +59,9 @@ choice config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y + config DEFAULT_TPPS + bool "Tiny Parallel Proportion" if IOSCHED_TPPS=y + config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y diff --git a/block/Makefile b/block/Makefile index 39b76ba..6e30ef4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_TPPS) += tpps-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY)+= blk-integrity.o diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c new file mode 100644 index 000..981fde2 --- /dev/null +++ b/block/tpps-iosched.c @@ -0,0 +1,1272 @@ +/* + * TPPS, or Tiny Parallel Proportion disk Scheduler. + * + * Based on ideas from Zhu Yanhai + * + * Copyright (C) 2013 Robin Dong + */ +#include +#include +#include +#include +#include +#include +#include +#include "blk-cgroup.h" +#include "blk.h" + +static struct kmem_cache *tpps_pool; + +struct tpps_queue { + /* reference count */ + int ref; + /* parent tpps_data */ + struct tpps_data *tppd; + /* tpps_group member */ + struct list_head tppg_node; + /* sorted list of pending requests */ + struct list_head sort_list; + struct tpps_group *tppg; + pid_t pid; + int online; + int rq_queued; +}; + +struct tppg_stats { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors 
transferred */ + struct blkg_statsectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stattime; +}; + +struct tpps_group { + struct blkg_policy_data pd; + /* tpps_data member */ + struct list_head tppd_node; + struct list_head *cur_dispatcher; + + unsigned int weight; + unsigned int new_weight; + unsigned
[RFC v1] add new io-scheduler to use cgroup on high-speed device
From: Robin Dong san...@taobao.com We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters. After testing different io-scheduler, we found that cfq is too slow and deadline can't run on cgroup. So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests only by using their individual weight and total weight (proportion) therefore it's simply and efficient. Test case: fusionio card, 4 cgroups, iodepth-512 groupname weight test1 1000 test2 800 test3 600 test4 400 Use tpps, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 30220 16 54 test2 28261 18 56 test3 26333 19 69 test4 20152 25 87 Use cfq, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 16478 30 242 test2 13015 39 347 test3 9300 54 371 test4 5806 87 393 Signed-off-by: Robin Dong san...@taobao.com Signed-off-by: Zhu Yanhai gaoyang@taobao.com Cc: Tejun Heo t...@kernel.org Cc: Vivek Goyal vgo...@redhat.com Cc: Jens Axboe ax...@kernel.dk Cc: Tao Ma taoma...@gmail.com --- block/Kconfig.iosched | 13 + block/Makefile |1 + block/tpps-iosched.c | 1272 include/linux/blkdev.h |2 +- 4 files changed, 1287 insertions(+), 1 deletions(-) create mode 100644 block/tpps-iosched.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9..e5e28c2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -21,6 +21,16 @@ config IOSCHED_DEADLINE a new point in the service tree and doing a batch of IO from there in case of expiry. +config IOSCHED_TPPS + tristate TPPS I/O scheduler + # If BLK_CGROUP is a module, TPPS has to be built as module. + default y + ---help--- + The TPPS I/O scheduler tries to distribute iops proportional + among all cgroups in the system. It should also provide a low + latency working environment, suitable for flash-based device. + Note: If BLK_CGROUP=m, then TPPS can be built only as module. 
+ config IOSCHED_CFQ tristate CFQ I/O scheduler default y @@ -49,6 +59,9 @@ choice config DEFAULT_DEADLINE bool Deadline if IOSCHED_DEADLINE=y + config DEFAULT_TPPS + bool Tiny Parallel Proportion if IOSCHED_TPPS=y + config DEFAULT_CFQ bool CFQ if IOSCHED_CFQ=y diff --git a/block/Makefile b/block/Makefile index 39b76ba..6e30ef4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_TPPS) += tpps-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY)+= blk-integrity.o diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c new file mode 100644 index 000..981fde2 --- /dev/null +++ b/block/tpps-iosched.c @@ -0,0 +1,1272 @@ +/* + * TPPS, or Tiny Parallel Proportion disk Scheduler. + * + * Based on ideas from Zhu Yanhai gaoyang@taobao.com + * + * Copyright (C) 2013 Robin Dong san...@taobao.com + */ +#include linux/module.h +#include linux/blkdev.h +#include linux/elevator.h +#include linux/jiffies.h +#include linux/rbtree.h +#include linux/ioprio.h +#include linux/blktrace_api.h +#include blk-cgroup.h +#include blk.h + +static struct kmem_cache *tpps_pool; + +struct tpps_queue { + /* reference count */ + int ref; + /* parent tpps_data */ + struct tpps_data *tppd; + /* tpps_group member */ + struct list_head tppg_node; + /* sorted list of pending requests */ + struct list_head sort_list; + struct tpps_group *tppg; + pid_t pid; + int online; + int rq_queued; +}; + +struct tppg_stats { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in 
scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_statsectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stattime; +}; + +struct tpps_group { + struct blkg_policy_data pd; + /* tpps_data
[RFC v1] add new io-scheduler to use cgroup on high-speed device
We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters. After testing different io-scheduler, we found that cfq is too slow and deadline can't run on cgroup. So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests only by using their individual weight and total weight (proportion) therefore it's simply and efficient. Test case: fusionio card, 4 cgroups, iodepth-512 groupname weight test1 1000 test2 800 test3 600 test4 400 Use tpps, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 30220 16 54 test2 28261 18 56 test3 26333 19 69 test4 20152 25 87 Use cfq, the result is: groupname iopsavg-rt(ms) max-rt(ms) test1 16478 30 242 test2 13015 39 347 test3 9300 54 371 test4 5806 87 393 Signed-off-by: Robin Dong san...@taobao.com Signed-off-by: Zhu Yanhai gaoyang@taobao.com Cc: Tejun Heo t...@kernel.org Cc: Vivek Goyal vgo...@redhat.com Cc: Jens Axboe ax...@kernel.dk Cc: Tao Ma taoma...@gmail.com --- block/Kconfig.iosched | 13 + block/Makefile |1 + block/tpps-iosched.c | 1272 include/linux/blkdev.h |2 +- 4 files changed, 1287 insertions(+), 1 deletions(-) create mode 100644 block/tpps-iosched.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9..e5e28c2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -21,6 +21,16 @@ config IOSCHED_DEADLINE a new point in the service tree and doing a batch of IO from there in case of expiry. +config IOSCHED_TPPS + tristate TPPS I/O scheduler + # If BLK_CGROUP is a module, TPPS has to be built as module. + default y + ---help--- + The TPPS I/O scheduler tries to distribute iops proportional + among all cgroups in the system. It should also provide a low + latency working environment, suitable for flash-based device. + Note: If BLK_CGROUP=m, then TPPS can be built only as module. 
+ config IOSCHED_CFQ tristate CFQ I/O scheduler default y @@ -49,6 +59,9 @@ choice config DEFAULT_DEADLINE bool Deadline if IOSCHED_DEADLINE=y + config DEFAULT_TPPS + bool Tiny Parallel Proportion if IOSCHED_TPPS=y + config DEFAULT_CFQ bool CFQ if IOSCHED_CFQ=y diff --git a/block/Makefile b/block/Makefile index 39b76ba..6e30ef4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_TPPS) += tpps-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY)+= blk-integrity.o diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c new file mode 100644 index 000..981fde2 --- /dev/null +++ b/block/tpps-iosched.c @@ -0,0 +1,1272 @@ +/* + * TPPS, or Tiny Parallel Proportion disk Scheduler. + * + * Based on ideas from Zhu Yanhai gaoyang@taobao.com + * + * Copyright (C) 2013 Robin Dong san...@taobao.com + */ +#include linux/module.h +#include linux/blkdev.h +#include linux/elevator.h +#include linux/jiffies.h +#include linux/rbtree.h +#include linux/ioprio.h +#include linux/blktrace_api.h +#include blk-cgroup.h +#include blk.h + +static struct kmem_cache *tpps_pool; + +struct tpps_queue { + /* reference count */ + int ref; + /* parent tpps_data */ + struct tpps_data *tppd; + /* tpps_group member */ + struct list_head tppg_node; + /* sorted list of pending requests */ + struct list_head sort_list; + struct tpps_group *tppg; + pid_t pid; + int online; + int rq_queued; +}; + +struct tppg_stats { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in 
scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_statsectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stattime; +}; + +struct tpps_group { + struct blkg_policy_data pd; + /* tpps_data member */ + struct list_head
Re: [PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it
ping 2012/9/20 Robin Dong : > From: Robin Dong > > When trying to modify flashcache to request based (current it's bio based), > we need > to make request from bios by ourselves, but dm_io() will submit these bios > directly, > so we propose to modify the dm_io() to return bios instead of submiting it. > > This could also improve the flexibility of dm_io(). > > Signed-off-by: Robin Dong > --- > drivers/md/dm.c | 11 +++ > include/linux/device-mapper.h |3 +++ > 2 files changed, 14 insertions(+), 0 deletions(-) > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > index 4e09b6f..bf6e3bb 100644 > --- a/drivers/md/dm.c > +++ b/drivers/md/dm.c > @@ -1459,11 +1459,22 @@ static int dm_request_based(struct mapped_device *md) > static void dm_request(struct request_queue *q, struct bio *bio) > { > struct mapped_device *md = q->queuedata; > + struct dm_table *map = dm_get_live_table(md); > + struct dm_target *ti = dm_table_find_target(map, bio->bi_sector); > + > + if (ti->type->mk_rq) { > + ti->type->mk_rq(ti, q, bio); > + goto out; > + } > > if (dm_request_based(md)) > blk_queue_bio(q, bio); > else > _dm_request(q, bio); > + > +out: > + dm_table_put(map); > + return; > } > > void dm_dispatch_request(struct request *rq) > diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h > index 38d27a1..2386389 100644 > --- a/include/linux/device-mapper.h > +++ b/include/linux/device-mapper.h > @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio > *bio, > union map_info *map_context); > typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request > *clone, > union map_info *map_context); > +typedef int (*dm_make_request_fn) (struct dm_target *ti, > + struct request_queue *q, struct bio *bio); > > /* > * Returns: > @@ -136,6 +138,7 @@ struct target_type { > dm_dtr_fn dtr; > dm_map_fn map; > dm_map_request_fn map_rq; > + dm_make_request_fn mk_rq; > dm_endio_fn end_io; > dm_request_endio_fn rq_end_io; > dm_presuspend_fn presuspend; > -- 
> 1.7.1 > -- -- Best Regard Robin Dong -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] md: add new interface 'mk_rq' in target_type
ping 2012/9/20 Robin Dong : > From: Robin Dong > > We are now trying to modify flashcache(https://github.com/facebook/flashcache) > to make it request based so that > we can let cfq io-controller control the bandwidth between different > io cgroups. > > A search in the dm directory tells me that only multipath is a request > based dm target and its functionality > is very simple and map_rq() is used to map the request to different > underlying devices. > We can't work in this way because: > > 1. the request which processed by map_rq() need to be issued to > different lower devices (disk device and cache device, in > flashcache), therefore the request > can't be totally remapped by simply changing its queue and returning > DM_MAPIO_REMAPPED in map_rq() like multipath_map() > 2. to submit bios drectly in map_rq() (by return DM_MAPIO_SUBMITTED) will > cause BUG_ON(!irqs_disabled()) > in dm_request_fn() because the > submit_bio()->generic_make_request()->blk_queue_bio() will definitly call > spin_unlock_irq to enable the irqs > > As above,the interface map_rq() provided by devcie-mapper framework > is not enough for an autonomous target, like flashcache. > > We propose to add a new > mk_rq interface so that we can make the requests > by ourselves. > > Signed-off-by: Robin Dong > --- > drivers/md/dm-io.c| 58 > drivers/md/dm-log.c |1 + > include/linux/dm-io.h |3 ++ > 3 files changed, 38 insertions(+), 24 deletions(-) > > diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c > index ea5dd28..f767792 100644 > --- a/drivers/md/dm-io.c > +++ b/drivers/md/dm-io.c > @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data) > /*- > * IO routines that accept a list of pages. 
> *---*/ > -static void do_region(int rw, unsigned region, struct dm_io_region *where, > - struct dpages *dp, struct io *io) > +static void do_region(struct dm_io_request *io_req, unsigned region, > + struct dm_io_region *where, struct dpages *dp, struct io *io) > { > struct bio *bio; > struct page *page; > @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct > dm_io_region *where, > sector_t remaining = where->count; > struct request_queue *q = bdev_get_queue(where->bdev); > sector_t discard_sectors; > + int rw = io_req->bi_rw; > > /* > * where->count may be zero if rw holds a flush and we need to > @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct > dm_io_region *where, > } > > atomic_inc(>count); > - submit_bio(rw, bio); > + if (!io_req->only_create_bio) > + submit_bio(rw, bio); > + else { > + bio->bi_rw |= rw; > + if (io_req->start) { > + io_req->end->bi_next = bio; > + io_req->end = bio; > + } else > + io_req->start = io_req->end = bio; > + bio->bi_next = NULL; > + } > } while (remaining); > } > > -static void dispatch_io(int rw, unsigned int num_regions, > +static void dispatch_io(struct dm_io_request *io_req, unsigned int > num_regions, > struct dm_io_region *where, struct dpages *dp, > struct io *io, int sync) > { > int i; > + int rw = io_req->bi_rw; > struct dpages old_pages = *dp; > > BUG_ON(num_regions > DM_IO_MAX_REGIONS); > @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions, > for (i = 0; i < num_regions; i++) { > *dp = old_pages; > if (where[i].count || (rw & REQ_FLUSH)) > - do_region(rw, i, where + i, dp, io); > + do_region(io_req, i, where + i, dp, io); > } > > /* > @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions, > dec_count(io, 0, 0); > } > > -static int sync_io(struct dm_io_client *client, unsigned int num_regions, > - struct dm_io_region *where, int rw, struct dpages *dp, > +static int sync_io(struct dm_io_request *io_req, unsigned int 
num_regions, > + struct dm_io_region *where, struc
Re: [PATCH 1/2] md: add new interface 'mk_rq' in target_type
ping 2012/9/20 Robin Dong robin.k.d...@gmail.com: From: Robin Dong san...@taobao.com We are now trying to modify flashcache(https://github.com/facebook/flashcache) to make it request based so that we can let cfq io-controller control the bandwidth between different io cgroups. A search in the dm directory tells me that only multipath is a request based dm target and its functionality is very simple and map_rq() is used to map the request to different underlying devices. We can't work in this way because: 1. the request which processed by map_rq() need to be issued to different lower devices (disk device and cache device, in flashcache), therefore the request can't be totally remapped by simply changing its queue and returning DM_MAPIO_REMAPPED in map_rq() like multipath_map() 2. to submit bios drectly in map_rq() (by return DM_MAPIO_SUBMITTED) will cause BUG_ON(!irqs_disabled()) in dm_request_fn() because the submit_bio()-generic_make_request()-blk_queue_bio() will definitly call spin_unlock_irq to enable the irqs As above,the interface map_rq() provided by devcie-mapper framework is not enough for an autonomous target, like flashcache. We propose to add a new mk_rq interface so that we can make the requests by ourselves. Signed-off-by: Robin Dong san...@taobao.com --- drivers/md/dm-io.c| 58 drivers/md/dm-log.c |1 + include/linux/dm-io.h |3 ++ 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea5dd28..f767792 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data) /*- * IO routines that accept a list of pages. 
*---*/ -static void do_region(int rw, unsigned region, struct dm_io_region *where, - struct dpages *dp, struct io *io) +static void do_region(struct dm_io_request *io_req, unsigned region, + struct dm_io_region *where, struct dpages *dp, struct io *io) { struct bio *bio; struct page *page; @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where-count; struct request_queue *q = bdev_get_queue(where-bdev); sector_t discard_sectors; + int rw = io_req-bi_rw; /* * where-count may be zero if rw holds a flush and we need to @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, } atomic_inc(io-count); - submit_bio(rw, bio); + if (!io_req-only_create_bio) + submit_bio(rw, bio); + else { + bio-bi_rw |= rw; + if (io_req-start) { + io_req-end-bi_next = bio; + io_req-end = bio; + } else + io_req-start = io_req-end = bio; + bio-bi_next = NULL; + } } while (remaining); } -static void dispatch_io(int rw, unsigned int num_regions, +static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { int i; + int rw = io_req-bi_rw; struct dpages old_pages = *dp; BUG_ON(num_regions DM_IO_MAX_REGIONS); @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions, for (i = 0; i num_regions; i++) { *dp = old_pages; if (where[i].count || (rw REQ_FLUSH)) - do_region(rw, i, where + i, dp, io); + do_region(io_req, i, where + i, dp, io); } /* @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, +static int sync_io(struct dm_io_request *io_req, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, unsigned long *error_bits) { /* @@ -385,7 +397,7 @@ static int sync_io(struct dm_io_client 
*client, unsigned int num_regions, volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(io_, __alignof__(struct io)); - if (num_regions 1 (rw RW_MASK) != WRITE) { + if (num_regions 1 (io_req-bi_rw RW_MASK) != WRITE
Re: [PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it
ping 2012/9/20 Robin Dong robin.k.d...@gmail.com: From: Robin Dong san...@taobao.com When trying to modify flashcache to request based (current it's bio based), we need to make request from bios by ourselves, but dm_io() will submit these bios directly, so we propose to modify the dm_io() to return bios instead of submiting it. This could also improve the flexibility of dm_io(). Signed-off-by: Robin Dong san...@taobao.com --- drivers/md/dm.c | 11 +++ include/linux/device-mapper.h |3 +++ 2 files changed, 14 insertions(+), 0 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6f..bf6e3bb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1459,11 +1459,22 @@ static int dm_request_based(struct mapped_device *md) static void dm_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q-queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti = dm_table_find_target(map, bio-bi_sector); + + if (ti-type-mk_rq) { + ti-type-mk_rq(ti, q, bio); + goto out; + } if (dm_request_based(md)) blk_queue_bio(q, bio); else _dm_request(q, bio); + +out: + dm_table_put(map); + return; } void dm_dispatch_request(struct request *rq) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38d27a1..2386389 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, union map_info *map_context); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_make_request_fn) (struct dm_target *ti, + struct request_queue *q, struct bio *bio); /* * Returns: @@ -136,6 +138,7 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_make_request_fn mk_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; -- 1.7.1 -- -- Best Regard Robin Dong -- To unsubscribe from this list: send the 
line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2 v5] block/throttle: Add IO submitted information in blkio.throttle
From: Robin Dong Currently, if the IO is throttled by io-throttle, the system admin has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_submitted which exposes the number of bios that have been sent into blk-throttle therefore the user could calculate the difference from throttle.io_serviced to see how many IOs are currently throttled. Cc: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Tao Ma Signed-off-by: Robin Dong --- v3 <-- v2: - Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. v4 <-- v3: - Add two new blkg_rwstat arguments to count total bios be sent into blk_throttle. v5 <-- v4: - Change name "io_submit_bytes" to "io_submitted_bytes". block/blk-throttle.c | 43 +++ 1 files changed, 43 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 46ddeff..c6391b5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -46,6 +46,10 @@ struct tg_stats_cpu { struct blkg_rwstat service_bytes; /* total IOs serviced, post merge */ struct blkg_rwstat serviced; + /* total bytes submitted into blk-throttle */ + struct blkg_rwstat submit_bytes; + /* total IOs submitted into blk-throttle */ + struct blkg_rwstat submitted; }; struct throtl_grp { @@ -266,6 +270,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) blkg_rwstat_reset(>service_bytes); blkg_rwstat_reset(>serviced); + blkg_rwstat_reset(>submit_bytes); + blkg_rwstat_reset(>submitted); } } @@ -699,6 +705,30 @@ static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, local_irq_restore(flags); } +static void throtl_update_submit_stats(struct throtl_grp *tg, u64 bytes, int rw) +{ + struct tg_stats_cpu *stats_cpu; + unsigned long flags; + + /* If per cpu stats are not allocated yet, don't do any accounting. 
*/ + if (tg->stats_cpu == NULL) + return; + + /* +* Disabling interrupts to provide mutual exclusion between two +* writes on same cpu. It probably is not needed for 64bit. Not +* optimizing that case yet. +*/ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(tg->stats_cpu); + + blkg_rwstat_add(_cpu->submitted, rw, 1); + blkg_rwstat_add(_cpu->submit_bytes, rw, bytes); + + local_irq_restore(flags); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = "throttle.io_submitted_bytes", + .private = offsetof(struct tg_stats_cpu, submit_bytes), + .read_seq_string = tg_print_cpu_rwstat, + }, + { + .name = "throttle.io_submitted", + .private = offsetof(struct tg_stats_cpu, submitted), + .read_seq_string = tg_print_cpu_rwstat, + }, { } /* terminate */ }; @@ -1128,6 +1168,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg_no_rule_group(tg, rw)) { throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); + throtl_update_submit_stats(tg, + bio->bi_size, bio->bi_rw); goto out_unlock_rcu; } } @@ -1141,6 +1183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; + throtl_update_submit_stats(tg, bio->bi_size, bio->bi_rw); if (tg->nr_queued[rw]) { /* * There is already another bio queued in same dir. No -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2 v5] block/throttle: remove redundant type transition
From: Robin Dong We don't need to convert tg to blkg and then convert it back in throtl_update_dispatch_stats(). Signed-off-by: Robin Dong --- block/blk-throttle.c |7 +++ 1 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..46ddeff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, +static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, int rw) { - struct throtl_grp *tg = blkg_to_tg(blkg); struct tg_stats_cpu *stats_cpu; unsigned long flags; @@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); + throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg = throtl_lookup_tg(td, blkcg); if (tg) { if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), + throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); goto out_unlock_rcu; } -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2 v5] block/throttle: remove redundant type transition
From: Robin Dong san...@taobao.com We don't need to convert tg to blkg and then convert it back in throtl_update_dispatch_stats(). Signed-off-by: Robin Dong san...@taobao.com --- block/blk-throttle.c |7 +++ 1 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..46ddeff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, +static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, int rw) { - struct throtl_grp *tg = blkg_to_tg(blkg); struct tg_stats_cpu *stats_cpu; unsigned long flags; @@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg-bytes_disp[rw] += bio-bi_size; tg-io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio-bi_size, bio-bi_rw); + throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg = throtl_lookup_tg(td, blkcg); if (tg) { if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), + throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); goto out_unlock_rcu; } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2 v5] block/throttle: Add IO submitted information in blkio.throttle
From: Robin Dong san...@taobao.com Currently, if the IO is throttled by io-throttle, the system admin has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_submitted which exposes the number of bios that have been sent into blk-throttle therefore the user could calculate the difference from throttle.io_serviced to see how many IOs are currently throttled. Cc: Tejun Heo t...@kernel.org Cc: Vivek Goyal vgo...@redhat.com Cc: Jens Axboe ax...@kernel.dk Signed-off-by: Tao Ma boyu...@taobao.com Signed-off-by: Robin Dong san...@taobao.com --- v3 -- v2: - Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. v4 -- v3: - Add two new blkg_rwstat arguments to count total bios be sent into blk_throttle. v5 -- v4: - Change name io_submit_bytes to io_submitted_bytes. block/blk-throttle.c | 43 +++ 1 files changed, 43 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 46ddeff..c6391b5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -46,6 +46,10 @@ struct tg_stats_cpu { struct blkg_rwstat service_bytes; /* total IOs serviced, post merge */ struct blkg_rwstat serviced; + /* total bytes submitted into blk-throttle */ + struct blkg_rwstat submit_bytes; + /* total IOs submitted into blk-throttle */ + struct blkg_rwstat submitted; }; struct throtl_grp { @@ -266,6 +270,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) blkg_rwstat_reset(sc-service_bytes); blkg_rwstat_reset(sc-serviced); + blkg_rwstat_reset(sc-submit_bytes); + blkg_rwstat_reset(sc-submitted); } } @@ -699,6 +705,30 @@ static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, local_irq_restore(flags); } +static void throtl_update_submit_stats(struct throtl_grp *tg, u64 bytes, int rw) +{ + struct tg_stats_cpu *stats_cpu; + unsigned long flags; + + /* If per cpu stats are not allocated yet, 
don't do any accounting. */ + if (tg-stats_cpu == NULL) + return; + + /* +* Disabling interrupts to provide mutual exclusion between two +* writes on same cpu. It probably is not needed for 64bit. Not +* optimizing that case yet. +*/ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(tg-stats_cpu); + + blkg_rwstat_add(stats_cpu-submitted, rw, 1); + blkg_rwstat_add(stats_cpu-submit_bytes, rw, bytes); + + local_irq_restore(flags); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = throttle.io_submitted_bytes, + .private = offsetof(struct tg_stats_cpu, submit_bytes), + .read_seq_string = tg_print_cpu_rwstat, + }, + { + .name = throttle.io_submitted, + .private = offsetof(struct tg_stats_cpu, submitted), + .read_seq_string = tg_print_cpu_rwstat, + }, { } /* terminate */ }; @@ -1128,6 +1168,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg_no_rule_group(tg, rw)) { throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); + throtl_update_submit_stats(tg, + bio-bi_size, bio-bi_rw); goto out_unlock_rcu; } } @@ -1141,6 +1183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; + throtl_update_submit_stats(tg, bio-bi_size, bio-bi_rw); if (tg-nr_queued[rw]) { /* * There is already another bio queued in same dir. No -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2 v4] block/throttle: Add IO submitted information in blkio.throttle
2012/10/27 Vivek Goyal : > On Fri, Oct 26, 2012 at 12:47:48PM +0800, Robin Dong wrote: > > [..] >> @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = { >> .private = offsetof(struct tg_stats_cpu, serviced), >> .read_seq_string = tg_print_cpu_rwstat, >> }, >> + { >> + .name = "throttle.io_submit_bytes", > > Do we really need io_submit_bytes stats? Your need seems to be able to > figure out if there are pending IOs in the group and if you need to > increase the bandwidth. For that, isn't number of bios enough? I just want to be more consistent to "io_service_bytes" and "io_serviced" pair > > Also even if we retain bytes, let us change it to "io_submitted_bytes". Since "io_service_bytes" is "service", I use "io_submit_bytes" Never mind, "io_submitted_bytes" is much better. > > Thanks > Vivek -- -- Best Regard Robin Dong -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2 v4] block/throttle: Add IO submitted information in blkio.throttle
2012/10/27 Vivek Goyal vgo...@redhat.com: On Fri, Oct 26, 2012 at 12:47:48PM +0800, Robin Dong wrote: [..] @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = throttle.io_submit_bytes, Do we really need io_submit_bytes stats? Your need seems to be able to figure out if there are pending IOs in the group and if you need to increase the bandwidth. For that, isn't number of bios enough? I just want to be more consistent to io_service_bytes and io_serviced pair Also even if we retain bytes, let us change it to io_submitted_bytes. Since io_service_bytes is service, I use io_submit_bytes Never mind, io_submitted_bytes is much better. Thanks Vivek -- -- Best Regard Robin Dong -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2 v4] block/throttle: Add IO submitted information in blkio.throttle
From: Robin Dong Currently, if the IO is throttled by io-throttle, the system admin has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_submitted which exposes the number of bios that have been sent into blk-throttle therefore the user could calculate the difference from throttle.io_serviced to see how many IOs are currently throttled. Cc: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Tao Ma Signed-off-by: Robin Dong --- v3 <-- v2: - Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. v4 <-- v3: - Add two new blkg_rwstat arguments to count total bios be sent into blk_throttle. block/blk-throttle.c | 43 +++ 1 files changed, 43 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 46ddeff..c6391b5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -46,6 +46,10 @@ struct tg_stats_cpu { struct blkg_rwstat service_bytes; /* total IOs serviced, post merge */ struct blkg_rwstat serviced; + /* total bytes submitted into blk-throttle */ + struct blkg_rwstat submit_bytes; + /* total IOs submitted into blk-throttle */ + struct blkg_rwstat submitted; }; struct throtl_grp { @@ -266,6 +270,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) blkg_rwstat_reset(>service_bytes); blkg_rwstat_reset(>serviced); + blkg_rwstat_reset(>submit_bytes); + blkg_rwstat_reset(>submitted); } } @@ -699,6 +705,30 @@ static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, local_irq_restore(flags); } +static void throtl_update_submit_stats(struct throtl_grp *tg, u64 bytes, int rw) +{ + struct tg_stats_cpu *stats_cpu; + unsigned long flags; + + /* If per cpu stats are not allocated yet, don't do any accounting. */ + if (tg->stats_cpu == NULL) + return; + + /* +* Disabling interrupts to provide mutual exclusion between two +* writes on same cpu. 
It probably is not needed for 64bit. Not +* optimizing that case yet. +*/ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(tg->stats_cpu); + + blkg_rwstat_add(_cpu->submitted, rw, 1); + blkg_rwstat_add(_cpu->submit_bytes, rw, bytes); + + local_irq_restore(flags); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = "throttle.io_submit_bytes", + .private = offsetof(struct tg_stats_cpu, submit_bytes), + .read_seq_string = tg_print_cpu_rwstat, + }, + { + .name = "throttle.io_submitted", + .private = offsetof(struct tg_stats_cpu, submitted), + .read_seq_string = tg_print_cpu_rwstat, + }, { } /* terminate */ }; @@ -1128,6 +1168,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg_no_rule_group(tg, rw)) { throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); + throtl_update_submit_stats(tg, + bio->bi_size, bio->bi_rw); goto out_unlock_rcu; } } @@ -1141,6 +1183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; + throtl_update_submit_stats(tg, bio->bi_size, bio->bi_rw); if (tg->nr_queued[rw]) { /* * There is already another bio queued in same dir. No -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2 v4] block/throttle: remove redundant type transition
From: Robin Dong We don't need to convert tg to blkg and then convert it back in throtl_update_dispatch_stats(). Signed-off-by: Robin Dong --- block/blk-throttle.c |7 +++ 1 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..46ddeff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, +static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, int rw) { - struct throtl_grp *tg = blkg_to_tg(blkg); struct tg_stats_cpu *stats_cpu; unsigned long flags; @@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); + throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg = throtl_lookup_tg(td, blkcg); if (tg) { if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), + throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); goto out_unlock_rcu; } -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2 v4] block/throttle: remove redundant type transition
From: Robin Dong san...@taobao.com We don't need to convert tg to blkg and then convert it back in throtl_update_dispatch_stats(). Signed-off-by: Robin Dong san...@taobao.com --- block/blk-throttle.c |7 +++ 1 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..46ddeff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, +static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, int rw) { - struct throtl_grp *tg = blkg_to_tg(blkg); struct tg_stats_cpu *stats_cpu; unsigned long flags; @@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg-bytes_disp[rw] += bio-bi_size; tg-io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio-bi_size, bio-bi_rw); + throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg = throtl_lookup_tg(td, blkcg); if (tg) { if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), + throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); goto out_unlock_rcu; } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2 v4] block/throttle: Add IO submitted information in blkio.throttle
From: Robin Dong san...@taobao.com Currently, if the IO is throttled by io-throttle, the system admin has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_submitted which exposes the number of bios that have been sent into blk-throttle therefore the user could calculate the difference from throttle.io_serviced to see how many IOs are currently throttled. Cc: Tejun Heo t...@kernel.org Cc: Vivek Goyal vgo...@redhat.com Cc: Jens Axboe ax...@kernel.dk Signed-off-by: Tao Ma boyu...@taobao.com Signed-off-by: Robin Dong san...@taobao.com --- v3 -- v2: - Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. v4 -- v3: - Add two new blkg_rwstat arguments to count total bios be sent into blk_throttle. block/blk-throttle.c | 43 +++ 1 files changed, 43 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 46ddeff..c6391b5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -46,6 +46,10 @@ struct tg_stats_cpu { struct blkg_rwstat service_bytes; /* total IOs serviced, post merge */ struct blkg_rwstat serviced; + /* total bytes submitted into blk-throttle */ + struct blkg_rwstat submit_bytes; + /* total IOs submitted into blk-throttle */ + struct blkg_rwstat submitted; }; struct throtl_grp { @@ -266,6 +270,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) blkg_rwstat_reset(sc-service_bytes); blkg_rwstat_reset(sc-serviced); + blkg_rwstat_reset(sc-submit_bytes); + blkg_rwstat_reset(sc-submitted); } } @@ -699,6 +705,30 @@ static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, local_irq_restore(flags); } +static void throtl_update_submit_stats(struct throtl_grp *tg, u64 bytes, int rw) +{ + struct tg_stats_cpu *stats_cpu; + unsigned long flags; + + /* If per cpu stats are not allocated yet, don't do any accounting. 
*/ + if (tg-stats_cpu == NULL) + return; + + /* +* Disabling interrupts to provide mutual exclusion between two +* writes on same cpu. It probably is not needed for 64bit. Not +* optimizing that case yet. +*/ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(tg-stats_cpu); + + blkg_rwstat_add(stats_cpu-submitted, rw, 1); + blkg_rwstat_add(stats_cpu-submit_bytes, rw, bytes); + + local_irq_restore(flags); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = throttle.io_submit_bytes, + .private = offsetof(struct tg_stats_cpu, submit_bytes), + .read_seq_string = tg_print_cpu_rwstat, + }, + { + .name = throttle.io_submitted, + .private = offsetof(struct tg_stats_cpu, submitted), + .read_seq_string = tg_print_cpu_rwstat, + }, { } /* terminate */ }; @@ -1128,6 +1168,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg_no_rule_group(tg, rw)) { throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); + throtl_update_submit_stats(tg, + bio-bi_size, bio-bi_rw); goto out_unlock_rcu; } } @@ -1141,6 +1183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; + throtl_update_submit_stats(tg, bio-bi_size, bio-bi_rw); if (tg-nr_queued[rw]) { /* * There is already another bio queued in same dir. No -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2 v4] block/throttle: Add IO queued information in blkio.throttle
From: Robin Dong Currently, if the IO is throttled by io-throttle, the system admin has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_queued which exposes the number of bios that have been sent to blk-throttle therefore the user could calculate the difference from throttle.io_serviced to see how many IOs are currently throttled. Cc: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Tao Ma Signed-off-by: Robin Dong --- v3 <-- v2: - Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. v4 <-- v3: - Add two new blkg_rwstat arguments to count total bios be sent in blk_throttle. block/blk-throttle.c | 44 1 files changed, 44 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 46ddeff..b122b0c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -70,6 +70,10 @@ struct throtl_grp { /* Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; + /* The stats of total number queued in blk-throtlle */ + struct blkg_rwstat io_queue_bytes; + struct blkg_rwstat io_queued; + /* bytes per second rate limits */ uint64_t bps[2]; @@ -267,6 +271,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) blkg_rwstat_reset(>service_bytes); blkg_rwstat_reset(>serviced); } + blkg_rwstat_reset(>io_queued); + blkg_rwstat_reset(>io_queue_bytes); } static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, @@ -699,6 +705,12 @@ static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, local_irq_restore(flags); } +static void throtl_update_queued_stats(struct throtl_grp *tg, u64 bytes, int rw) +{ + blkg_rwstat_add(>io_queued, rw, 1); + blkg_rwstat_add(>io_queue_bytes, rw, bytes); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -952,6 +964,15 @@ static u64 tg_prfill_cpu_rwstat(struct 
seq_file *sf, return __blkg_prfill_rwstat(sf, pd, ); } +static u64 tg_prfill_io_queued(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + struct blkg_rwstat *rwstat = (void *)tg + off; + + return __blkg_prfill_rwstat(sf, pd, rwstat); +} + static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -962,6 +983,16 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, return 0; } +static int tg_print_io_queued(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, tg_prfill_io_queued, _policy_throtl, + cft->private, true); + return 0; +} + static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1084,6 +1115,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = "throttle.io_queue_bytes", + .private = offsetof(struct throtl_grp, io_queue_bytes), + .read_seq_string = tg_print_io_queued, + }, + { + .name = "throttle.io_queued", + .private = offsetof(struct throtl_grp, io_queued), + .read_seq_string = tg_print_io_queued, + }, { } /* terminate */ }; @@ -1128,6 +1169,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg_no_rule_group(tg, rw)) { throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); + throtl_update_queued_stats(tg, + bio->bi_size, bio->bi_rw); goto out_unlock_rcu; } } @@ -1141,6 +1184,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; + throtl_update_queued_stats(tg, bio->bi_size, bio->bi_rw); if (tg->nr_queued[rw]) { /* * There is already another bio queued in same dir. 
No -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2 v4] block/throttle: remove redundant type transition
From: Robin Dong We don't need to convert tg to blkg and then convert it back in throtl_update_dispatch_stats(). Signed-off-by: Robin Dong --- block/blk-throttle.c |7 +++ 1 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..46ddeff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, +static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, int rw) { - struct throtl_grp *tg = blkg_to_tg(blkg); struct tg_stats_cpu *stats_cpu; unsigned long flags; @@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); + throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg = throtl_lookup_tg(td, blkcg); if (tg) { if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), + throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw); goto out_unlock_rcu; } -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2 v4] block/throttle: remove redundant type transition
From: Robin Dong san...@taobao.com We don't need to convert tg to blkg and then convert it back in throtl_update_dispatch_stats(). Signed-off-by: Robin Dong san...@taobao.com --- block/blk-throttle.c |7 +++ 1 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..46ddeff 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, +static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, int rw) { - struct throtl_grp *tg = blkg_to_tg(blkg); struct tg_stats_cpu *stats_cpu; unsigned long flags; @@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg-bytes_disp[rw] += bio-bi_size; tg-io_disp[rw]++; - throtl_update_dispatch_stats(tg_to_blkg(tg), bio-bi_size, bio-bi_rw); + throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) tg = throtl_lookup_tg(td, blkcg); if (tg) { if (tg_no_rule_group(tg, rw)) { - throtl_update_dispatch_stats(tg_to_blkg(tg), + throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); goto out_unlock_rcu; } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2 v4] block/throttle: Add IO queued information in blkio.throttle
From: Robin Dong san...@taobao.com Currently, if the IO is throttled by io-throttle, the system admin has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_queued which exposes the number of bios that have been sent to blk-throttle therefore the user could calculate the difference from throttle.io_serviced to see how many IOs are currently throttled. Cc: Tejun Heo t...@kernel.org Cc: Vivek Goyal vgo...@redhat.com Cc: Jens Axboe ax...@kernel.dk Signed-off-by: Tao Ma boyu...@taobao.com Signed-off-by: Robin Dong san...@taobao.com --- v3 -- v2: - Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. v4 -- v3: - Add two new blkg_rwstat arguments to count total bios be sent in blk_throttle. block/blk-throttle.c | 44 1 files changed, 44 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 46ddeff..b122b0c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -70,6 +70,10 @@ struct throtl_grp { /* Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; + /* The stats of total number queued in blk-throtlle */ + struct blkg_rwstat io_queue_bytes; + struct blkg_rwstat io_queued; + /* bytes per second rate limits */ uint64_t bps[2]; @@ -267,6 +271,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) blkg_rwstat_reset(sc-service_bytes); blkg_rwstat_reset(sc-serviced); } + blkg_rwstat_reset(tg-io_queued); + blkg_rwstat_reset(tg-io_queue_bytes); } static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, @@ -699,6 +705,12 @@ static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes, local_irq_restore(flags); } +static void throtl_update_queued_stats(struct throtl_grp *tg, u64 bytes, int rw) +{ + blkg_rwstat_add(tg-io_queued, rw, 1); + blkg_rwstat_add(tg-io_queue_bytes, rw, bytes); +} + static void throtl_charge_bio(struct 
throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -952,6 +964,15 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, rwstat); } +static u64 tg_prfill_io_queued(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + struct blkg_rwstat *rwstat = (void *)tg + off; + + return __blkg_prfill_rwstat(sf, pd, rwstat); +} + static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -962,6 +983,16 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, return 0; } +static int tg_print_io_queued(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, tg_prfill_io_queued, blkcg_policy_throtl, + cft-private, true); + return 0; +} + static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1084,6 +1115,16 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = throttle.io_queue_bytes, + .private = offsetof(struct throtl_grp, io_queue_bytes), + .read_seq_string = tg_print_io_queued, + }, + { + .name = throttle.io_queued, + .private = offsetof(struct throtl_grp, io_queued), + .read_seq_string = tg_print_io_queued, + }, { } /* terminate */ }; @@ -1128,6 +1169,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (tg_no_rule_group(tg, rw)) { throtl_update_dispatch_stats(tg, bio-bi_size, bio-bi_rw); + throtl_update_queued_stats(tg, + bio-bi_size, bio-bi_rw); goto out_unlock_rcu; } } @@ -1141,6 +1184,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) if (unlikely(!tg)) goto out_unlock; + throtl_update_queued_stats(tg, bio-bi_size, bio-bi_rw); if (tg-nr_queued[rw]) { /* * There is already another bio queued in same dir. 
No -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3] block/throttle: Add IO throttled information in blkio.throttle
From: Robin Dong Currently, if the IO is throttled by io-throttle, the SA has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_queued which indicates how many IOs are currently throttled. The nr_queued[] of struct throtl_grp is of type "unsigned int" and updates to it are atomic both at 32bit and 64bit platforms, so we could just read tg->nr_queued only under blkcg->lock. Changelog from v2: Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. Cc: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Tao Ma Signed-off-by: Robin Dong --- block/blk-throttle.c | 40 1 files changed, 40 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..e410448 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -953,6 +953,32 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, ); } +static u64 tg_prfill_io_queued(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + static const char *rwstr[] = { + [READ] = "Read", + [WRITE] = "Write", + }; + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = NULL; + unsigned int v; + int i; + + if (pd->blkg->q->backing_dev_info.dev) + dname = dev_name(pd->blkg->q->backing_dev_info.dev); + + if (!dname) + return 0; + + for (i = 0; i <= WRITE; i++) + seq_printf(sf, "%s %s %u\n", dname, rwstr[i], tg->nr_queued[i]); + + v = tg->nr_queued[READ] + tg->nr_queued[WRITE]; + seq_printf(sf, "%s Total %u\n", dname, v); + return v; +} + static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -963,6 +989,16 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, return 0; } +static int tg_print_io_queued(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + 
blkcg_print_blkgs(sf, blkcg, tg_prfill_io_queued, _policy_throtl, + cft->private, true); + return 0; +} + static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1085,6 +1121,10 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = "throttle.io_queued", + .read_seq_string = tg_print_io_queued, + }, { } /* terminate */ }; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH V3] block/throttle: Add IO throttled information in blkio.throttle
From: Robin Dong san...@taobao.com Currently, if the IO is throttled by io-throttle, the SA has no idea of the situation and can't report it to the real application user about that he/she has to do something. So this patch adds a new interface named blkio.throttle.io_queued which indicates how many IOs are currently throttled. The nr_queued[] of struct throtl_grp is of type unsigned int and updates to it are atomic both at 32bit and 64bit platforms, so we could just read tg-nr_queued only under blkcg-lock. Changelog from v2: Use nr-queued[] of struct throtl_grp for stats instaed of adding new blkg_rwstat. Cc: Tejun Heo t...@kernel.org Cc: Vivek Goyal vgo...@redhat.com Cc: Jens Axboe ax...@kernel.dk Signed-off-by: Tao Ma boyu...@taobao.com Signed-off-by: Robin Dong san...@taobao.com --- block/blk-throttle.c | 40 1 files changed, 40 insertions(+), 0 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a9664fa..e410448 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -953,6 +953,32 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, rwstat); } +static u64 tg_prfill_io_queued(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + static const char *rwstr[] = { + [READ] = Read, + [WRITE] = Write, + }; + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = NULL; + unsigned int v; + int i; + + if (pd-blkg-q-backing_dev_info.dev) + dname = dev_name(pd-blkg-q-backing_dev_info.dev); + + if (!dname) + return 0; + + for (i = 0; i = WRITE; i++) + seq_printf(sf, %s %s %u\n, dname, rwstr[i], tg-nr_queued[i]); + + v = tg-nr_queued[READ] + tg-nr_queued[WRITE]; + seq_printf(sf, %s Total %u\n, dname, v); + return v; +} + static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, struct seq_file *sf) { @@ -963,6 +989,16 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, return 0; } +static int tg_print_io_queued(struct cgroup *cgrp, struct cftype *cft, + 
struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, tg_prfill_io_queued, blkcg_policy_throtl, + cft-private, true); + return 0; +} + static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1085,6 +1121,10 @@ static struct cftype throtl_files[] = { .private = offsetof(struct tg_stats_cpu, serviced), .read_seq_string = tg_print_cpu_rwstat, }, + { + .name = throttle.io_queued, + .read_seq_string = tg_print_io_queued, + }, { } /* terminate */ }; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it
From: Robin Dong When trying to modify flashcache to request based (current it's bio based), we need to make request from bios by ourselves, but dm_io() will submit these bios directly, so we propose to modify the dm_io() to return bios instead of submiting it. This could also improve the flexibility of dm_io(). Signed-off-by: Robin Dong --- drivers/md/dm.c | 11 +++ include/linux/device-mapper.h |3 +++ 2 files changed, 14 insertions(+), 0 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6f..bf6e3bb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1459,11 +1459,22 @@ static int dm_request_based(struct mapped_device *md) static void dm_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti = dm_table_find_target(map, bio->bi_sector); + + if (ti->type->mk_rq) { + ti->type->mk_rq(ti, q, bio); + goto out; + } if (dm_request_based(md)) blk_queue_bio(q, bio); else _dm_request(q, bio); + +out: + dm_table_put(map); + return; } void dm_dispatch_request(struct request *rq) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38d27a1..2386389 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, union map_info *map_context); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_make_request_fn) (struct dm_target *ti, + struct request_queue *q, struct bio *bio); /* * Returns: @@ -136,6 +138,7 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_make_request_fn mk_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] md: add new interface 'mk_rq' in target_type
From: Robin Dong We are now trying to modify flashcache(https://github.com/facebook/flashcache) to make it request based so that we can let cfq io-controller control the bandwidth between different io cgroups. A search in the dm directory tells me that only multipath is a request based dm target and its functionality is very simple and map_rq() is used to map the request to different underlying devices. We can't work in this way because: 1. the request which processed by map_rq() need to be issued to different lower devices (disk device and cache device, in flashcache), therefore the request can't be totally remapped by simply changing its queue and returning DM_MAPIO_REMAPPED in map_rq() like multipath_map() 2. to submit bios drectly in map_rq() (by return DM_MAPIO_SUBMITTED) will cause BUG_ON(!irqs_disabled()) in dm_request_fn() because the submit_bio()->generic_make_request()->blk_queue_bio() will definitly call spin_unlock_irq to enable the irqs As above,the interface map_rq() provided by devcie-mapper framework is not enough for an autonomous target, like flashcache. We propose to add a new mk_rq interface so that we can make the requests by ourselves. Signed-off-by: Robin Dong --- drivers/md/dm-io.c| 58 drivers/md/dm-log.c |1 + include/linux/dm-io.h |3 ++ 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea5dd28..f767792 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data) /*- * IO routines that accept a list of pages. 
*---*/ -static void do_region(int rw, unsigned region, struct dm_io_region *where, - struct dpages *dp, struct io *io) +static void do_region(struct dm_io_request *io_req, unsigned region, + struct dm_io_region *where, struct dpages *dp, struct io *io) { struct bio *bio; struct page *page; @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where->count; struct request_queue *q = bdev_get_queue(where->bdev); sector_t discard_sectors; + int rw = io_req->bi_rw; /* * where->count may be zero if rw holds a flush and we need to @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, } atomic_inc(>count); - submit_bio(rw, bio); + if (!io_req->only_create_bio) + submit_bio(rw, bio); + else { + bio->bi_rw |= rw; + if (io_req->start) { + io_req->end->bi_next = bio; + io_req->end = bio; + } else + io_req->start = io_req->end = bio; + bio->bi_next = NULL; + } } while (remaining); } -static void dispatch_io(int rw, unsigned int num_regions, +static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { int i; + int rw = io_req->bi_rw; struct dpages old_pages = *dp; BUG_ON(num_regions > DM_IO_MAX_REGIONS); @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions, for (i = 0; i < num_regions; i++) { *dp = old_pages; if (where[i].count || (rw & REQ_FLUSH)) - do_region(rw, i, where + i, dp, io); + do_region(io_req, i, where + i, dp, io); } /* @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, +static int sync_io(struct dm_io_request *io_req, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, unsigned long *error_bits) { /* @@ -385,7 +397,7 @@ static int 
sync_io(struct dm_io_client *client, unsigned int num_regions, volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(_, __alignof__(struct io)); - if (num_regions > 1 && (rw & RW_MASK) != WRITE) { + if (num_regions > 1 && (io_req->bi_rw & RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } @@ -393,12 +4
[PATCH 1/2] md: add new interface 'mk_rq' in target_type
From: Robin Dong san...@taobao.com We are now trying to modify flashcache(https://github.com/facebook/flashcache) to make it request based so that we can let cfq io-controller control the bandwidth between different io cgroups. A search in the dm directory tells me that only multipath is a request based dm target and its functionality is very simple and map_rq() is used to map the request to different underlying devices. We can't work in this way because: 1. the request which processed by map_rq() need to be issued to different lower devices (disk device and cache device, in flashcache), therefore the request can't be totally remapped by simply changing its queue and returning DM_MAPIO_REMAPPED in map_rq() like multipath_map() 2. to submit bios drectly in map_rq() (by return DM_MAPIO_SUBMITTED) will cause BUG_ON(!irqs_disabled()) in dm_request_fn() because the submit_bio()-generic_make_request()-blk_queue_bio() will definitly call spin_unlock_irq to enable the irqs As above,the interface map_rq() provided by devcie-mapper framework is not enough for an autonomous target, like flashcache. We propose to add a new mk_rq interface so that we can make the requests by ourselves. Signed-off-by: Robin Dong san...@taobao.com --- drivers/md/dm-io.c| 58 drivers/md/dm-log.c |1 + include/linux/dm-io.h |3 ++ 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea5dd28..f767792 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data) /*- * IO routines that accept a list of pages. 
*---*/ -static void do_region(int rw, unsigned region, struct dm_io_region *where, - struct dpages *dp, struct io *io) +static void do_region(struct dm_io_request *io_req, unsigned region, + struct dm_io_region *where, struct dpages *dp, struct io *io) { struct bio *bio; struct page *page; @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where-count; struct request_queue *q = bdev_get_queue(where-bdev); sector_t discard_sectors; + int rw = io_req-bi_rw; /* * where-count may be zero if rw holds a flush and we need to @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, } atomic_inc(io-count); - submit_bio(rw, bio); + if (!io_req-only_create_bio) + submit_bio(rw, bio); + else { + bio-bi_rw |= rw; + if (io_req-start) { + io_req-end-bi_next = bio; + io_req-end = bio; + } else + io_req-start = io_req-end = bio; + bio-bi_next = NULL; + } } while (remaining); } -static void dispatch_io(int rw, unsigned int num_regions, +static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { int i; + int rw = io_req-bi_rw; struct dpages old_pages = *dp; BUG_ON(num_regions DM_IO_MAX_REGIONS); @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions, for (i = 0; i num_regions; i++) { *dp = old_pages; if (where[i].count || (rw REQ_FLUSH)) - do_region(rw, i, where + i, dp, io); + do_region(io_req, i, where + i, dp, io); } /* @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, +static int sync_io(struct dm_io_request *io_req, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, unsigned long *error_bits) { /* @@ -385,7 +397,7 @@ static int sync_io(struct dm_io_client 
*client, unsigned int num_regions, volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(io_, __alignof__(struct io)); - if (num_regions 1 (rw RW_MASK) != WRITE) { + if (num_regions 1 (io_req-bi_rw RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } @@ -393,12 +405,12 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io
[PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it
From: Robin Dong san...@taobao.com When trying to modify flashcache to request based (current it's bio based), we need to make request from bios by ourselves, but dm_io() will submit these bios directly, so we propose to modify the dm_io() to return bios instead of submiting it. This could also improve the flexibility of dm_io(). Signed-off-by: Robin Dong san...@taobao.com --- drivers/md/dm.c | 11 +++ include/linux/device-mapper.h |3 +++ 2 files changed, 14 insertions(+), 0 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6f..bf6e3bb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1459,11 +1459,22 @@ static int dm_request_based(struct mapped_device *md) static void dm_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q-queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti = dm_table_find_target(map, bio-bi_sector); + + if (ti-type-mk_rq) { + ti-type-mk_rq(ti, q, bio); + goto out; + } if (dm_request_based(md)) blk_queue_bio(q, bio); else _dm_request(q, bio); + +out: + dm_table_put(map); + return; } void dm_dispatch_request(struct request *rq) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38d27a1..2386389 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, union map_info *map_context); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_make_request_fn) (struct dm_target *ti, + struct request_queue *q, struct bio *bio); /* * Returns: @@ -136,6 +138,7 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_make_request_fn mk_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it
From: Robin Dong When trying to modify flashcache to request based (current it's bio based), we need to make request from bios by ourselves, but dm_io() will submit these bios directly, so we propose to modify the dm_io() to return bios instead of submiting it. This could also improve the flexibility of dm_io(). Signed-off-by: Robin Dong --- drivers/md/dm-bufio.c |2 + drivers/md/dm-io.c | 58 +++ drivers/md/dm-kcopyd.c |1 + drivers/md/dm-log.c |1 + drivers/md/dm-raid1.c |3 ++ drivers/md/dm-snap-persistent.c |1 + include/linux/dm-io.h |3 ++ 7 files changed, 45 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cc06a1e..f5867b9 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -487,6 +487,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, .notify.fn = dmio_complete, .notify.context = b, .client = b->c->dm_io, + .submit_bio = 1, }; struct dm_io_region region = { .bdev = b->c->bdev, @@ -1200,6 +1201,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, + .submit_bio = 1, }; struct dm_io_region io_reg = { .bdev = c->bdev, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea5dd28..f235182 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data) /*- * IO routines that accept a list of pages. 
*---*/ -static void do_region(int rw, unsigned region, struct dm_io_region *where, - struct dpages *dp, struct io *io) +static void do_region(struct dm_io_request *io_req, unsigned region, + struct dm_io_region *where, struct dpages *dp, struct io *io) { struct bio *bio; struct page *page; @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where->count; struct request_queue *q = bdev_get_queue(where->bdev); sector_t discard_sectors; + int rw = io_req->bi_rw; /* * where->count may be zero if rw holds a flush and we need to @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, } atomic_inc(>count); - submit_bio(rw, bio); + if (io_req->submit_bio) + submit_bio(rw, bio); + else { + bio->bi_rw |= rw; + if (io_req->start) { + io_req->end->bi_next = bio; + io_req->end = bio; + } else + io_req->start = io_req->end = bio; + bio->bi_next = NULL; + } } while (remaining); } -static void dispatch_io(int rw, unsigned int num_regions, +static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { int i; + int rw = io_req->bi_rw; struct dpages old_pages = *dp; BUG_ON(num_regions > DM_IO_MAX_REGIONS); @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions, for (i = 0; i < num_regions; i++) { *dp = old_pages; if (where[i].count || (rw & REQ_FLUSH)) - do_region(rw, i, where + i, dp, io); + do_region(io_req, i, where + i, dp, io); } /* @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, +static int sync_io(struct dm_io_request *io_req, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, unsigned long *error_bits) { /* @@ -385,7 +397,7 @@ static int sync_io(struct 
dm_io_client *client, unsigned int num_regions, volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(_, __alignof__(struct io)); - if (num_regions > 1 && (rw & RW_MASK) != WRITE) { + if (num_regions > 1 && (io_req->bi_rw & RW_MASK) != WRITE) {
[PATCH 1/2] md: add new interface 'mk_rq' in target_type
From: Robin Dong We are now trying to modify flashcache(https://github.com/facebook/flashcache) to make it request based so that we can let cfq io-controller control the bandwidth between different io cgroups. A search in the dm directory tells me that only multipath is a request based dm target and its functionality is very simple and map_rq() is used to map the request to different underlying devices. We can't work in this way because: 1. the request which processed by map_rq() need to be issued to different lower devices (disk device and cache device, in flashcache), therefore the request can't be totally remapped by simply changing its queue and returning DM_MAPIO_REMAPPED in map_rq() like multipath_map() 2. to submit bios drectly in map_rq() (by return DM_MAPIO_SUBMITTED) will cause BUG_ON(!irqs_disabled()) in dm_request_fn() because the submit_bio()->generic_make_request()->blk_queue_bio() will definitly call spin_unlock_irq to enable the irqs As above,the interface map_rq() provided by devcie-mapper framework is not enough for an autonomous target, like flashcache. We propose to add a new mk_rq interface so that we can make the requests by ourselves. 
Signed-off-by: Robin Dong --- drivers/md/dm.c | 10 ++ include/linux/device-mapper.h |3 +++ 2 files changed, 13 insertions(+), 0 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6f..3ae67de 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1459,11 +1459,21 @@ static int dm_request_based(struct mapped_device *md) static void dm_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti = dm_table_find_target(map, bio->bi_sector); + + if (ti->type->mk_rq) { + ti->type->mk_rq(ti, q, bio); + goto out; + } if (dm_request_based(md)) blk_queue_bio(q, bio); else _dm_request(q, bio); + +out: + dm_table_put(map); } void dm_dispatch_request(struct request *rq) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38d27a1..2386389 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, union map_info *map_context); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_make_request_fn) (struct dm_target *ti, + struct request_queue *q, struct bio *bio); /* * Returns: @@ -136,6 +138,7 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_make_request_fn mk_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] md: add new interface 'mk_rq' in target_type
From: Robin Dong san...@taobao.com We are now trying to modify flashcache(https://github.com/facebook/flashcache) to make it request based so that we can let cfq io-controller control the bandwidth between different io cgroups. A search in the dm directory tells me that only multipath is a request based dm target and its functionality is very simple and map_rq() is used to map the request to different underlying devices. We can't work in this way because: 1. the request which processed by map_rq() need to be issued to different lower devices (disk device and cache device, in flashcache), therefore the request can't be totally remapped by simply changing its queue and returning DM_MAPIO_REMAPPED in map_rq() like multipath_map() 2. to submit bios drectly in map_rq() (by return DM_MAPIO_SUBMITTED) will cause BUG_ON(!irqs_disabled()) in dm_request_fn() because the submit_bio()-generic_make_request()-blk_queue_bio() will definitly call spin_unlock_irq to enable the irqs As above,the interface map_rq() provided by devcie-mapper framework is not enough for an autonomous target, like flashcache. We propose to add a new mk_rq interface so that we can make the requests by ourselves. 
Signed-off-by: Robin Dong san...@taobao.com --- drivers/md/dm.c | 10 ++ include/linux/device-mapper.h |3 +++ 2 files changed, 13 insertions(+), 0 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6f..3ae67de 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1459,11 +1459,21 @@ static int dm_request_based(struct mapped_device *md) static void dm_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q-queuedata; + struct dm_table *map = dm_get_live_table(md); + struct dm_target *ti = dm_table_find_target(map, bio-bi_sector); + + if (ti-type-mk_rq) { + ti-type-mk_rq(ti, q, bio); + goto out; + } if (dm_request_based(md)) blk_queue_bio(q, bio); else _dm_request(q, bio); + +out: + dm_table_put(map); } void dm_dispatch_request(struct request *rq) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 38d27a1..2386389 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, union map_info *map_context); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_make_request_fn) (struct dm_target *ti, + struct request_queue *q, struct bio *bio); /* * Returns: @@ -136,6 +138,7 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_make_request_fn mk_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it
From: Robin Dong san...@taobao.com When trying to modify flashcache to request based (current it's bio based), we need to make request from bios by ourselves, but dm_io() will submit these bios directly, so we propose to modify the dm_io() to return bios instead of submiting it. This could also improve the flexibility of dm_io(). Signed-off-by: Robin Dong san...@taobao.com --- drivers/md/dm-bufio.c |2 + drivers/md/dm-io.c | 58 +++ drivers/md/dm-kcopyd.c |1 + drivers/md/dm-log.c |1 + drivers/md/dm-raid1.c |3 ++ drivers/md/dm-snap-persistent.c |1 + include/linux/dm-io.h |3 ++ 7 files changed, 45 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cc06a1e..f5867b9 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -487,6 +487,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, .notify.fn = dmio_complete, .notify.context = b, .client = b-c-dm_io, + .submit_bio = 1, }; struct dm_io_region region = { .bdev = b-c-bdev, @@ -1200,6 +1201,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c-dm_io, + .submit_bio = 1, }; struct dm_io_region io_reg = { .bdev = c-bdev, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ea5dd28..f235182 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data) /*- * IO routines that accept a list of pages. 
*---*/ -static void do_region(int rw, unsigned region, struct dm_io_region *where, - struct dpages *dp, struct io *io) +static void do_region(struct dm_io_request *io_req, unsigned region, + struct dm_io_region *where, struct dpages *dp, struct io *io) { struct bio *bio; struct page *page; @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, sector_t remaining = where-count; struct request_queue *q = bdev_get_queue(where-bdev); sector_t discard_sectors; + int rw = io_req-bi_rw; /* * where-count may be zero if rw holds a flush and we need to @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, } atomic_inc(io-count); - submit_bio(rw, bio); + if (io_req-submit_bio) + submit_bio(rw, bio); + else { + bio-bi_rw |= rw; + if (io_req-start) { + io_req-end-bi_next = bio; + io_req-end = bio; + } else + io_req-start = io_req-end = bio; + bio-bi_next = NULL; + } } while (remaining); } -static void dispatch_io(int rw, unsigned int num_regions, +static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions, struct dm_io_region *where, struct dpages *dp, struct io *io, int sync) { int i; + int rw = io_req-bi_rw; struct dpages old_pages = *dp; BUG_ON(num_regions DM_IO_MAX_REGIONS); @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions, for (i = 0; i num_regions; i++) { *dp = old_pages; if (where[i].count || (rw REQ_FLUSH)) - do_region(rw, i, where + i, dp, io); + do_region(io_req, i, where + i, dp, io); } /* @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } -static int sync_io(struct dm_io_client *client, unsigned int num_regions, - struct dm_io_region *where, int rw, struct dpages *dp, +static int sync_io(struct dm_io_request *io_req, unsigned int num_regions, + struct dm_io_region *where, struct dpages *dp, unsigned long *error_bits) { /* @@ -385,7 +397,7 @@ static int sync_io(struct dm_io_client *client, 
unsigned int num_regions, volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(io_, __alignof__(struct io)); - if (num_regions 1 (rw RW_MASK) != WRITE) { + if (num_regions 1 (io_req-bi_rw RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } @@ -393,12 +405,12 @@ static int sync_io