[RFC v1] add new io-scheduler to use cgroup on high-speed device

2013-06-04 Thread Robin Dong
We want to use blkio.cgroup on high-speed devices (like fusionio) for our
mysql clusters.
After testing different io-schedulers, we found that cfq is too slow and
deadline can't work with cgroups.
So we developed a new io-scheduler: tpps (Tiny Parallel Proportion
Scheduler). It dispatches requests using only each group's individual
weight and the total weight (the proportion), so it is simple and
efficient.
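
To illustrate the idea (a sketch only, with made-up names, not code from
the patch below): in each dispatch round, a busy group's share of the
dispatch budget is simply weight / total_weight:

	struct group {
		unsigned int weight;		/* per-cgroup weight */
		unsigned int nr_queued;		/* requests waiting */
	};

	/* requests group 'g' may dispatch out of 'budget' this round */
	static unsigned int group_quota(struct group *grps, int nr,
					struct group *g, unsigned int budget)
	{
		unsigned int total = 0;
		int i;

		for (i = 0; i < nr; i++)
			if (grps[i].nr_queued)	/* only busy groups count */
				total += grps[i].weight;

		if (!total || !g->nr_queued)
			return 0;
		return budget * g->weight / total;
	}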

Test case: fusionio card, 4 cgroups, iodepth=512

groupname   weight
test1         1000
test2          800
test3          600
test4          400

Use tpps, the result is:

groupname    iops   avg-rt(ms)   max-rt(ms)
test1       30220       16           54
test2       28261       18           56
test3       26333       19           69
test4       20152       25           87

Use cfq, the result is:

groupname    iops   avg-rt(ms)   max-rt(ms)
test1       16478       30          242
test2       13015       39          347
test3        9300       54          371
test4        5806       87          393

Signed-off-by: Robin Dong <san...@taobao.com>
Signed-off-by: Zhu Yanhai <gaoyang@taobao.com>
Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Tao Ma <taoma...@gmail.com>
---
 block/Kconfig.iosched  |   13 +
 block/Makefile         |    1 +
 block/tpps-iosched.c   | 1272 
 include/linux/blkdev.h |    2 +-
 4 files changed, 1287 insertions(+), 1 deletions(-)
 create mode 100644 block/tpps-iosched.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9..e5e28c2 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -21,6 +21,16 @@ config IOSCHED_DEADLINE
  a new point in the service tree and doing a batch of IO from there
  in case of expiry.

+config IOSCHED_TPPS
+   tristate "TPPS I/O scheduler"
+   # If BLK_CGROUP is a module, TPPS has to be built as module.
+   default y
+   ---help---
+ The TPPS I/O scheduler tries to distribute iops proportionally
+ among all cgroups in the system. It should also provide a low
+ latency working environment, suitable for flash-based devices.
+ Note: If BLK_CGROUP=m, then TPPS can be built only as module.
+
 config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
default y
@@ -49,6 +59,9 @@ choice
config DEFAULT_DEADLINE
bool "Deadline" if IOSCHED_DEADLINE=y

+   config DEFAULT_TPPS
+   bool "Tiny Parallel Proportion" if IOSCHED_TPPS=y
+
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y

diff --git a/block/Makefile b/block/Makefile
index 39b76ba..6e30ef4 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)  += blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)  += cfq-iosched.o
+obj-$(CONFIG_IOSCHED_TPPS) += tpps-iosched.o

 obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)+= blk-integrity.o
diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c
new file mode 100644
index 000..981fde2
--- /dev/null
+++ b/block/tpps-iosched.c
@@ -0,0 +1,1272 @@
+/*
+ *  TPPS, or Tiny Parallel Proportion disk Scheduler.
+ *
+ *  Based on ideas from Zhu Yanhai 
+ *
+ *  Copyright (C) 2013 Robin Dong 
+ */
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/blktrace_api.h>
+#include "blk-cgroup.h"
+#include "blk.h"
+
+static struct kmem_cache *tpps_pool;
+
+struct tpps_queue {
+   /* reference count */
+   int ref;
+   /* parent tpps_data */
+   struct tpps_data *tppd;
+   /* tpps_group member */
+   struct list_head tppg_node;
+   /* sorted list of pending requests */
+   struct list_head sort_list;
+   struct tpps_group *tppg;
+   pid_t pid;
+   int online;
+   int rq_queued;
+};
+
+struct tppg_stats {
+   /* total bytes transferred */
+   struct blkg_rwstat  service_bytes;
+   /* total IOs serviced, post merge */
+   struct blkg_rwstat  serviced;
+   /* number of ios merged */
+   struct blkg_rwstat  merged;
+   /* total time spent on device in ns, may not be accurate w/ queueing */
+   struct blkg_rwstat  service_time;
+   /* total time spent waiting in scheduler queue in ns */
+   struct blkg_rwstat  wait_time;
+   /* number of IOs queued up */
+   struct blkg_rwstat  queued;
+   /* total sectors transferred */
+   struct blkg_statsectors;
+   /* total disk time and nr sectors dispatched by this group */
+   struct blkg_stattime;
+};
+
+struct tpps_group {
+   struct blkg_policy_data pd;
+   /* tpps_data member */
+   struct list_head tppd_node;
+   struct list_head *cur_dispatcher;
+
+   unsigned int weight;
+   unsigned int new_weight;
+   unsigned int dev_weight;
+   unsigned int l

Re: [PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it

2012-11-13 Thread Robin Dong
ping

2012/9/20 Robin Dong <robin.k.d...@gmail.com>:
> From: Robin Dong <san...@taobao.com>
>
> While trying to make flashcache request-based (currently it is
> bio-based), we need to build requests from the bios ourselves, but
> dm_io() submits these bios directly, so we propose to modify dm_io()
> to return the bios instead of submitting them (see the usage sketch
> after the quoted patch below).
>
> This could also improve the flexibility of dm_io().
>
> Signed-off-by: Robin Dong <san...@taobao.com>
> ---
>  drivers/md/dm.c   |   11 +++
>  include/linux/device-mapper.h |3 +++
>  2 files changed, 14 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 4e09b6f..bf6e3bb 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1459,11 +1459,22 @@ static int dm_request_based(struct mapped_device *md)
>  static void dm_request(struct request_queue *q, struct bio *bio)
>  {
> struct mapped_device *md = q->queuedata;
> +   struct dm_table *map = dm_get_live_table(md);
> +   struct dm_target *ti = dm_table_find_target(map, bio->bi_sector);
> +
> +   if (ti->type->mk_rq) {
> +   ti->type->mk_rq(ti, q, bio);
> +   goto out;
> +   }
>
> if (dm_request_based(md))
> blk_queue_bio(q, bio);
> else
> _dm_request(q, bio);
> +
> +out:
> +   dm_table_put(map);
> +   return;
>  }
>
>  void dm_dispatch_request(struct request *rq)
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 38d27a1..2386389 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio 
> *bio,
>   union map_info *map_context);
>  typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request 
> *clone,
>   union map_info *map_context);
> +typedef int (*dm_make_request_fn) (struct dm_target *ti,
> +   struct request_queue *q, struct bio *bio);
>
>  /*
>   * Returns:
> @@ -136,6 +138,7 @@ struct target_type {
> dm_dtr_fn dtr;
> dm_map_fn map;
> dm_map_request_fn map_rq;
> +   dm_make_request_fn mk_rq;
> dm_endio_fn end_io;
> dm_request_endio_fn rq_end_io;
> dm_presuspend_fn presuspend;
> --
> 1.7.1
>

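To make the intent concrete, a caller of the modified dm_io() could
collect the chained bios like this (a sketch only; the dm_io_request
fields follow the patch, everything else is invented for illustration):

	struct dm_io_request io_req;
	struct bio *bio;

	io_req.bi_rw = READ;
	io_req.only_create_bio = 1;	/* chain bios instead of submitting */
	io_req.start = io_req.end = NULL;

	dm_io(&io_req, num_regions, where, NULL);

	/* walk the bi_next chain that dm-io built for us */
	for (bio = io_req.start; bio; bio = bio->bi_next)
		flashcache_make_request(bio);	/* hypothetical target code */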


-- 
--
Best Regard
Robin Dong


Re: [PATCH 1/2] md: add new interface 'mk_rq' in target_type

2012-11-13 Thread Robin Dong
ping

2012/9/20 Robin Dong <robin.k.d...@gmail.com>:
> From: Robin Dong <san...@taobao.com>
>
> We are now trying to modify flashcache
> (https://github.com/facebook/flashcache) to make it request-based so
> that we can let the cfq io-controller control the bandwidth between
> different io cgroups.
>
> A search in the dm directory tells me that only multipath is a
> request-based dm target; its functionality is very simple, and
> map_rq() is used to map the request to different underlying devices.
> We can't work this way because:
>
> 1. a request processed by map_rq() needs to be issued to different
> lower devices (the disk device and the cache device, in flashcache),
> so the request can't simply be remapped by changing its queue and
> returning DM_MAPIO_REMAPPED in map_rq() like multipath_map() does
> 2. submitting bios directly from map_rq() (by returning
> DM_MAPIO_SUBMITTED) will trigger BUG_ON(!irqs_disabled()) in
> dm_request_fn(), because submit_bio()->generic_make_request()->
> blk_queue_bio() will definitely call spin_unlock_irq() to re-enable
> interrupts
>
> As shown above, the map_rq() interface provided by the device-mapper
> framework is not enough for an autonomous target like flashcache.
>
> We propose to add a new mk_rq interface so that we can make the
> requests ourselves (see the sketch after the quoted patch below).
>
> Signed-off-by: Robin Dong <san...@taobao.com>
> ---
>  drivers/md/dm-io.c|   58 
>  drivers/md/dm-log.c   |1 +
>  include/linux/dm-io.h |3 ++
>  3 files changed, 38 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
> index ea5dd28..f767792 100644
> --- a/drivers/md/dm-io.c
> +++ b/drivers/md/dm-io.c
> @@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data)
>  /*-
>   * IO routines that accept a list of pages.
>   *---*/
> -static void do_region(int rw, unsigned region, struct dm_io_region *where,
> - struct dpages *dp, struct io *io)
> +static void do_region(struct dm_io_request *io_req, unsigned region,
> +   struct dm_io_region *where, struct dpages *dp, struct io *io)
>  {
> struct bio *bio;
> struct page *page;
> @@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct 
> dm_io_region *where,
> sector_t remaining = where->count;
> struct request_queue *q = bdev_get_queue(where->bdev);
> sector_t discard_sectors;
> +   int rw = io_req->bi_rw;
>
> /*
>  * where->count may be zero if rw holds a flush and we need to
> @@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct 
> dm_io_region *where,
> }
>
> atomic_inc(&io->count);
> -   submit_bio(rw, bio);
> +   if (!io_req->only_create_bio)
> +   submit_bio(rw, bio);
> +   else {
> +   bio->bi_rw |= rw;
> +   if (io_req->start) {
> +   io_req->end->bi_next = bio;
> +   io_req->end = bio;
> +   } else
> +   io_req->start = io_req->end = bio;
> +   bio->bi_next = NULL;
> +   }
> } while (remaining);
>  }
>
> -static void dispatch_io(int rw, unsigned int num_regions,
> +static void dispatch_io(struct dm_io_request *io_req, unsigned int 
> num_regions,
> struct dm_io_region *where, struct dpages *dp,
> struct io *io, int sync)
>  {
> int i;
> +   int rw = io_req->bi_rw;
> struct dpages old_pages = *dp;
>
> BUG_ON(num_regions > DM_IO_MAX_REGIONS);
> @@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
> for (i = 0; i < num_regions; i++) {
> *dp = old_pages;
> if (where[i].count || (rw & REQ_FLUSH))
> -   do_region(rw, i, where + i, dp, io);
> +   do_region(io_req, i, where + i, dp, io);
> }
>
> /*
> @@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
> dec_count(io, 0, 0);
>  }
>
> -static int sync_io(struct dm_io_client *client, unsigned int num_regions,
> -  struct dm_io_region *where, int rw, struct dpages *dp,
> +static int sync_io(struct dm_io_request *io_req,  unsigned int num_regions,
> +  struct dm_io_region *where, struc

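A sketch of how a target might hook the proposed mk_rq (only struct
target_type and the mk_rq member come from the patch; everything else
is invented for illustration):

	static int flashcache_mk_rq(struct dm_target *ti,
				    struct request_queue *q, struct bio *bio)
	{
		/*
		 * Build the request(s) from 'bio' ourselves and issue
		 * them to the underlying disk and cache devices;
		 * device-mapper no longer remaps a ready-made request.
		 */
		return 0;
	}

	static struct target_type flashcache_target = {
		.name  = "flashcache",
		.mk_rq = flashcache_mk_rq,	/* proposed new hook */
		/* .ctr, .dtr, .end_io, ... omitted */
	};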

[PATCH 2/2 v5] block/throttle: Add IO submitted information in blkio.throttle

2012-11-02 Thread Robin Dong
From: Robin Dong <san...@taobao.com>

Currently, if IO is throttled by io-throttle, the system admin has no idea
of the situation and can't tell the real application user that he/she has
to do something about it.

So this patch adds a new interface named blkio.throttle.io_submitted which
exposes the number of bios that have been sent into blk-throttle, so the
user can take the difference from throttle.io_serviced to see how many
IOs are currently throttled.
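
For example (hypothetical numbers), if a group's two counters read

	blkio.throttle.io_submitted:  8:16 Read 1000
	blkio.throttle.io_serviced:   8:16 Read  940

then 1000 - 940 = 60 read bios are currently held back in blk-throttle.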

Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: Jens Axboe <ax...@kernel.dk>
Signed-off-by: Tao Ma <boyu...@taobao.com>
Signed-off-by: Robin Dong <san...@taobao.com>
---
v3 <-- v2:
 - Use nr_queued[] of struct throtl_grp for stats instead of adding new
blkg_rwstat.

v4 <-- v3:
 - Add two new blkg_rwstat counters for the total bios sent into
blk-throttle.

v5 <-- v4:
 - Change name "io_submit_bytes" to "io_submitted_bytes".

 block/blk-throttle.c |   43 +++
 1 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 46ddeff..c6391b5 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -46,6 +46,10 @@ struct tg_stats_cpu {
struct blkg_rwstat  service_bytes;
/* total IOs serviced, post merge */
struct blkg_rwstat  serviced;
+   /* total bytes submitted into blk-throttle */
+   struct blkg_rwstat  submit_bytes;
+   /* total IOs submitted into blk-throttle */
+   struct blkg_rwstat  submitted;
 };
 
 struct throtl_grp {
@@ -266,6 +270,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
 
blkg_rwstat_reset(&sc->service_bytes);
blkg_rwstat_reset(&sc->serviced);
+   blkg_rwstat_reset(&sc->submit_bytes);
+   blkg_rwstat_reset(&sc->submitted);
}
 }
 
@@ -699,6 +705,30 @@ static void throtl_update_dispatch_stats(struct throtl_grp 
*tg, u64 bytes,
local_irq_restore(flags);
 }
 
+static void throtl_update_submit_stats(struct throtl_grp *tg, u64 bytes, int 
rw)
+{
+   struct tg_stats_cpu *stats_cpu;
+   unsigned long flags;
+
+   /* If per cpu stats are not allocated yet, don't do any accounting. */
+   if (tg->stats_cpu == NULL)
+   return;
+
+   /*
+* Disabling interrupts to provide mutual exclusion between two
+* writes on same cpu. It probably is not needed for 64bit. Not
+* optimizing that case yet.
+*/
+   local_irq_save(flags);
+
+   stats_cpu = this_cpu_ptr(tg->stats_cpu);
+
+   blkg_rwstat_add(&stats_cpu->submitted, rw, 1);
+   blkg_rwstat_add(&stats_cpu->submit_bytes, rw, bytes);
+
+   local_irq_restore(flags);
+}
+
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
bool rw = bio_data_dir(bio);
@@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = {
.private = offsetof(struct tg_stats_cpu, serviced),
.read_seq_string = tg_print_cpu_rwstat,
},
+   {
+   .name = "throttle.io_submitted_bytes",
+   .private = offsetof(struct tg_stats_cpu, submit_bytes),
+   .read_seq_string = tg_print_cpu_rwstat,
+   },
+   {
+   .name = "throttle.io_submitted",
+   .private = offsetof(struct tg_stats_cpu, submitted),
+   .read_seq_string = tg_print_cpu_rwstat,
+   },
{ } /* terminate */
 };
 
@@ -1128,6 +1168,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
if (tg_no_rule_group(tg, rw)) {
throtl_update_dispatch_stats(tg,
 bio->bi_size, bio->bi_rw);
+   throtl_update_submit_stats(tg,
+   bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
}
@@ -1141,6 +1183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
if (unlikely(!tg))
goto out_unlock;
 
+   throtl_update_submit_stats(tg, bio->bi_size, bio->bi_rw);
if (tg->nr_queued[rw]) {
/*
 * There is already another bio queued in same dir. No
-- 
1.7.1



[PATCH 1/2 v5] block/throttle: remove redundant type transition

2012-11-02 Thread Robin Dong
From: Robin Dong <san...@taobao.com>

We don't need to convert tg to blkg and then convert it back in
throtl_update_dispatch_stats().

Signed-off-by: Robin Dong <san...@taobao.com>
---
 block/blk-throttle.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a9664fa..46ddeff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct 
throtl_grp *tg,
return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
+static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes,
 int rw)
 {
-   struct throtl_grp *tg = blkg_to_tg(blkg);
struct tg_stats_cpu *stats_cpu;
unsigned long flags;
 
@@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct 
bio *bio)
tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++;
 
-   throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
+   throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
tg = throtl_lookup_tg(td, blkcg);
if (tg) {
if (tg_no_rule_group(tg, rw)) {
-   throtl_update_dispatch_stats(tg_to_blkg(tg),
+   throtl_update_dispatch_stats(tg,
 bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
-- 
1.7.1



Re: [PATCH 2/2 v4] block/throttle: Add IO submitted information in blkio.throttle

2012-10-27 Thread Robin Dong
2012/10/27 Vivek Goyal <vgo...@redhat.com>:
> On Fri, Oct 26, 2012 at 12:47:48PM +0800, Robin Dong wrote:
>
> [..]
>> @@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = {
>>   .private = offsetof(struct tg_stats_cpu, serviced),
>>   .read_seq_string = tg_print_cpu_rwstat,
>>   },
>> + {
>> + .name = "throttle.io_submit_bytes",
>
> Do we really need io_submit_bytes stats? Your need seems to be able to
> figure out if there are pending IOs in the group and if you need to
> increase the bandwidth. For that, isn't number of bios enough?

 I just want to be consistent with the "io_service_bytes" and "io_serviced" pair.

>
> Also even if we retain bytes, let us change it to "io_submitted_bytes".

Since "io_service_bytes" is "service", I use "io_submit_bytes"
Never mind, "io_submitted_bytes" is much better.

>
> Thanks
> Vivek



-- 
--
Best Regard
Robin Dong


[PATCH 2/2 v4] block/throttle: Add IO submitted information in blkio.throttle

2012-10-25 Thread Robin Dong
From: Robin Dong <san...@taobao.com>

Currently, if IO is throttled by io-throttle, the system admin has no idea
of the situation and can't tell the real application user that he/she has
to do something about it.

So this patch adds a new interface named blkio.throttle.io_submitted which
exposes the number of bios that have been sent into blk-throttle, so the
user can take the difference from throttle.io_serviced to see how many
IOs are currently throttled.

Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: Jens Axboe <ax...@kernel.dk>
Signed-off-by: Tao Ma <boyu...@taobao.com>
Signed-off-by: Robin Dong <san...@taobao.com>
---
v3 <-- v2:
 - Use nr_queued[] of struct throtl_grp for stats instead of adding new
blkg_rwstat.

v4 <-- v3:
 - Add two new blkg_rwstat counters for the total bios sent into
blk-throttle.

 block/blk-throttle.c |   43 +++
 1 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 46ddeff..c6391b5 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -46,6 +46,10 @@ struct tg_stats_cpu {
struct blkg_rwstat  service_bytes;
/* total IOs serviced, post merge */
struct blkg_rwstat  serviced;
+   /* total bytes submitted into blk-throttle */
+   struct blkg_rwstat  submit_bytes;
+   /* total IOs submitted into blk-throttle */
+   struct blkg_rwstat  submitted;
 };
 
 struct throtl_grp {
@@ -266,6 +270,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
 
blkg_rwstat_reset(&sc->service_bytes);
blkg_rwstat_reset(&sc->serviced);
+   blkg_rwstat_reset(&sc->submit_bytes);
+   blkg_rwstat_reset(&sc->submitted);
}
 }
 
@@ -699,6 +705,30 @@ static void throtl_update_dispatch_stats(struct throtl_grp 
*tg, u64 bytes,
local_irq_restore(flags);
 }
 
+static void throtl_update_submit_stats(struct throtl_grp *tg, u64 bytes, int 
rw)
+{
+   struct tg_stats_cpu *stats_cpu;
+   unsigned long flags;
+
+   /* If per cpu stats are not allocated yet, don't do any accounting. */
+   if (tg->stats_cpu == NULL)
+   return;
+
+   /*
+* Disabling interrupts to provide mutual exclusion between two
+* writes on same cpu. It probably is not needed for 64bit. Not
+* optimizing that case yet.
+*/
+   local_irq_save(flags);
+
+   stats_cpu = this_cpu_ptr(tg->stats_cpu);
+
+   blkg_rwstat_add(&stats_cpu->submitted, rw, 1);
+   blkg_rwstat_add(&stats_cpu->submit_bytes, rw, bytes);
+
+   local_irq_restore(flags);
+}
+
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
bool rw = bio_data_dir(bio);
@@ -1084,6 +1114,16 @@ static struct cftype throtl_files[] = {
.private = offsetof(struct tg_stats_cpu, serviced),
.read_seq_string = tg_print_cpu_rwstat,
},
+   {
+   .name = "throttle.io_submit_bytes",
+   .private = offsetof(struct tg_stats_cpu, submit_bytes),
+   .read_seq_string = tg_print_cpu_rwstat,
+   },
+   {
+   .name = "throttle.io_submitted",
+   .private = offsetof(struct tg_stats_cpu, submitted),
+   .read_seq_string = tg_print_cpu_rwstat,
+   },
{ } /* terminate */
 };
 
@@ -1128,6 +1168,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
if (tg_no_rule_group(tg, rw)) {
throtl_update_dispatch_stats(tg,
 bio->bi_size, bio->bi_rw);
+   throtl_update_submit_stats(tg,
+   bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
}
@@ -1141,6 +1183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
if (unlikely(!tg))
goto out_unlock;
 
+   throtl_update_submit_stats(tg, bio->bi_size, bio->bi_rw);
if (tg->nr_queued[rw]) {
/*
 * There is already another bio queued in same dir. No
-- 
1.7.1



[PATCH 1/2 v4] block/throttle: remove redundant type transition

2012-10-25 Thread Robin Dong
From: Robin Dong <san...@taobao.com>

We don't need to convert tg to blkg and then convert it back in
throtl_update_dispatch_stats().

Signed-off-by: Robin Dong <san...@taobao.com>
---
 block/blk-throttle.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a9664fa..46ddeff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct 
throtl_grp *tg,
return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
+static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes,
 int rw)
 {
-   struct throtl_grp *tg = blkg_to_tg(blkg);
struct tg_stats_cpu *stats_cpu;
unsigned long flags;
 
@@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct 
bio *bio)
tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++;
 
-   throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
+   throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
tg = throtl_lookup_tg(td, blkcg);
if (tg) {
if (tg_no_rule_group(tg, rw)) {
-   throtl_update_dispatch_stats(tg_to_blkg(tg),
+   throtl_update_dispatch_stats(tg,
 bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
-- 
1.7.1



[PATCH 2/2 v4] block/throttle: Add IO queued information in blkio.throttle

2012-10-18 Thread Robin Dong
From: Robin Dong <san...@taobao.com>

Currently, if IO is throttled by io-throttle, the system admin has no idea
of the situation and can't tell the real application user that he/she has
to do something about it.

So this patch adds a new interface named blkio.throttle.io_queued which
exposes the number of bios that have been sent to blk-throttle, so the
user can take the difference from throttle.io_serviced to see how many
IOs are currently throttled.

Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: Jens Axboe <ax...@kernel.dk>
Signed-off-by: Tao Ma <boyu...@taobao.com>
Signed-off-by: Robin Dong <san...@taobao.com>
---
v3 <-- v2:
 - Use nr_queued[] of struct throtl_grp for stats instead of adding new
blkg_rwstat.

v4 <-- v3:
 - Add two new blkg_rwstat counters for the total bios sent into
blk-throttle.

 block/blk-throttle.c |   44 
 1 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 46ddeff..b122b0c 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -70,6 +70,10 @@ struct throtl_grp {
/* Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
 
+   /* Stats for the total number of bios queued into blk-throttle */
+   struct blkg_rwstat io_queue_bytes;
+   struct blkg_rwstat io_queued;
+
/* bytes per second rate limits */
uint64_t bps[2];
 
@@ -267,6 +271,8 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
blkg_rwstat_reset(&sc->service_bytes);
blkg_rwstat_reset(&sc->serviced);
}
+   blkg_rwstat_reset(&tg->io_queued);
+   blkg_rwstat_reset(&tg->io_queue_bytes);
 }
 
 static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
@@ -699,6 +705,12 @@ static void throtl_update_dispatch_stats(struct throtl_grp 
*tg, u64 bytes,
local_irq_restore(flags);
 }
 
+static void throtl_update_queued_stats(struct throtl_grp *tg, u64 bytes, int 
rw)
+{
+   blkg_rwstat_add(&tg->io_queued, rw, 1);
+   blkg_rwstat_add(&tg->io_queue_bytes, rw, bytes);
+}
+
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
bool rw = bio_data_dir(bio);
@@ -952,6 +964,15 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
+static u64 tg_prfill_io_queued(struct seq_file *sf,
+   struct blkg_policy_data *pd, int off)
+{
+   struct throtl_grp *tg = pd_to_tg(pd);
+   struct blkg_rwstat *rwstat = (void *)tg + off;
+
+   return __blkg_prfill_rwstat(sf, pd, rwstat);
+}
+
 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
   struct seq_file *sf)
 {
@@ -962,6 +983,16 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct 
cftype *cft,
return 0;
 }
 
+static int tg_print_io_queued(struct cgroup *cgrp, struct cftype *cft,
+  struct seq_file *sf)
+{
+   struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+   blkcg_print_blkgs(sf, blkcg, tg_prfill_io_queued, &blkcg_policy_throtl,
+ cft->private, true);
+   return 0;
+}
+
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  int off)
 {
@@ -1084,6 +1115,16 @@ static struct cftype throtl_files[] = {
.private = offsetof(struct tg_stats_cpu, serviced),
.read_seq_string = tg_print_cpu_rwstat,
},
+   {
+   .name = "throttle.io_queue_bytes",
+   .private = offsetof(struct throtl_grp, io_queue_bytes),
+   .read_seq_string = tg_print_io_queued,
+   },
+   {
+   .name = "throttle.io_queued",
+   .private = offsetof(struct throtl_grp, io_queued),
+   .read_seq_string = tg_print_io_queued,
+   },
{ } /* terminate */
 };
 
@@ -1128,6 +1169,8 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
if (tg_no_rule_group(tg, rw)) {
throtl_update_dispatch_stats(tg,
 bio->bi_size, bio->bi_rw);
+   throtl_update_queued_stats(tg,
+   bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
}
@@ -1141,6 +1184,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
if (unlikely(!tg))
goto out_unlock;
 
+   throtl_update_queued_stats(tg, bio->bi_size, bio->bi_rw);
if (tg->nr_queued[rw]) {
/*
 * There is already another bio queued in same dir. No
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2 v4] block/throttle: remove redundant type transition

2012-10-18 Thread Robin Dong
From: Robin Dong 

We don't need to convert tg to blkg and then convert it back in
throtl_update_dispatch_stats().

Signed-off-by: Robin Dong 
---
 block/blk-throttle.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a9664fa..46ddeff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -674,10 +674,9 @@ static bool tg_may_dispatch(struct throtl_data *td, struct 
throtl_grp *tg,
return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
+static void throtl_update_dispatch_stats(struct throtl_grp *tg, u64 bytes,
 int rw)
 {
-   struct throtl_grp *tg = blkg_to_tg(blkg);
struct tg_stats_cpu *stats_cpu;
unsigned long flags;
 
@@ -708,7 +707,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct 
bio *bio)
tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++;
 
-   throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
+   throtl_update_dispatch_stats(tg, bio->bi_size, bio->bi_rw);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -1127,7 +1126,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio 
*bio)
tg = throtl_lookup_tg(td, blkcg);
if (tg) {
if (tg_no_rule_group(tg, rw)) {
-   throtl_update_dispatch_stats(tg_to_blkg(tg),
+   throtl_update_dispatch_stats(tg,
 bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
}
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH V3] block/throttle: Add IO throttled information in blkio.throttle

2012-10-09 Thread Robin Dong
From: Robin Dong 

Currently, if IO is throttled by io-throttle, the system admin (SA) has no
way to see the situation, and so cannot tell the application's real user that
he/she needs to do something. So this patch adds a new interface named
blkio.throttle.io_queued which indicates how many IOs are currently
throttled.

The nr_queued[] of struct throtl_grp is of type "unsigned int" and updates
to it are atomic on both 32-bit and 64-bit platforms, so it is safe to read
tg->nr_queued while holding only blkcg->lock.
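
As an illustration, the seq_printf() calls in tg_prfill_io_queued() below
produce one block per device, e.g. (the device name and counts here are made
up):

    8:16 Read 3
    8:16 Write 5
    8:16 Total 8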

Changelog from v2:
Use nr_queued[] of struct throtl_grp for stats instead of adding a new 
blkg_rwstat.

Cc: Tejun Heo 
Cc: Vivek Goyal 
Cc: Jens Axboe 
Signed-off-by: Tao Ma 
Signed-off-by: Robin Dong 
---
 block/blk-throttle.c |   40 
 1 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a9664fa..e410448 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -953,6 +953,32 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
+static u64 tg_prfill_io_queued(struct seq_file *sf,
+   struct blkg_policy_data *pd, int off)
+{
+   static const char *rwstr[] = {
+   [READ]  = "Read",
+   [WRITE] = "Write",
+   };
+   struct throtl_grp *tg = pd_to_tg(pd);
+   const char *dname = NULL;
+   unsigned int v;
+   int i;
+
+   if (pd->blkg->q->backing_dev_info.dev)
+   dname = dev_name(pd->blkg->q->backing_dev_info.dev);
+
+   if (!dname)
+   return 0;
+
+   for (i = 0; i <= WRITE; i++)
+   seq_printf(sf, "%s %s %u\n", dname, rwstr[i], tg->nr_queued[i]);
+
+   v = tg->nr_queued[READ] + tg->nr_queued[WRITE];
+   seq_printf(sf, "%s Total %u\n", dname, v);
+   return v;
+}
+
 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
   struct seq_file *sf)
 {
@@ -963,6 +989,16 @@ static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct 
cftype *cft,
return 0;
 }
 
+static int tg_print_io_queued(struct cgroup *cgrp, struct cftype *cft,
+  struct seq_file *sf)
+{
+   struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+   blkcg_print_blkgs(sf, blkcg, tg_prfill_io_queued, &blkcg_policy_throtl,
+ cft->private, true);
+   return 0;
+}
+
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  int off)
 {
@@ -1085,6 +1121,10 @@ static struct cftype throtl_files[] = {
.private = offsetof(struct tg_stats_cpu, serviced),
.read_seq_string = tg_print_cpu_rwstat,
},
+   {
+   .name = "throttle.io_queued",
+   .read_seq_string = tg_print_io_queued,
+   },
{ } /* terminate */
 };
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it

2012-09-19 Thread Robin Dong
From: Robin Dong 

When trying to make flashcache request-based (currently it is bio-based), we
need to build requests from the bios ourselves, but dm_io() will submit these
bios directly, so we propose to modify dm_io() to return the bios instead of
submitting them.

This could also improve the flexibility of dm_io().
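
For context, a minimal sketch of a caller collecting the bios instead of
letting dm_io() submit them (the only_create_bio/start/end fields come from
the dm-io.c patch in this series; build_requests() and make_rq_done() are
hypothetical names):

    /* Sketch only: ask dm_io() to chain the bios it builds via bi_next
     * instead of submitting them, then turn them into requests ourselves. */
    static void make_rq_done(unsigned long error, void *context)
    {
            /* runs once the requests we dispatch below have completed */
    }

    static void build_requests(struct dm_io_client *client,
                               struct dm_io_region *region, void *addr)
    {
            struct bio *bio;
            struct dm_io_request io_req = {
                    .bi_rw = WRITE,
                    .mem.type = DM_IO_KMEM,
                    .mem.ptr.addr = addr,
                    .notify.fn = make_rq_done, /* async: dm_io() returns at once */
                    .client = client,
                    .only_create_bio = 1,      /* chain the bios, don't submit */
            };

            dm_io(&io_req, 1, region, NULL);

            /* walk the chain that do_region() linked through bi_next */
            for (bio = io_req.start; bio; bio = bio->bi_next) {
                    /* ... merge bios into a struct request and dispatch it */
            }
    }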

Signed-off-by: Robin Dong 
---
 drivers/md/dm.c   |   11 +++
 include/linux/device-mapper.h |3 +++
 2 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4e09b6f..bf6e3bb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1459,11 +1459,22 @@ static int dm_request_based(struct mapped_device *md)
 static void dm_request(struct request_queue *q, struct bio *bio)
 {
struct mapped_device *md = q->queuedata;
+   struct dm_table *map = dm_get_live_table(md);
+   struct dm_target *ti = dm_table_find_target(map, bio->bi_sector);
+
+   if (ti->type->mk_rq) {
+   ti->type->mk_rq(ti, q, bio);
+   goto out;
+   }
 
if (dm_request_based(md))
blk_queue_bio(q, bio);
else
_dm_request(q, bio);
+
+out:
+   dm_table_put(map);
+   return;
 }
 
 void dm_dispatch_request(struct request *rq)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 38d27a1..2386389 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio 
*bio,
  union map_info *map_context);
 typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
  union map_info *map_context);
+typedef int (*dm_make_request_fn) (struct dm_target *ti,
+   struct request_queue *q, struct bio *bio);
 
 /*
  * Returns:
@@ -136,6 +138,7 @@ struct target_type {
dm_dtr_fn dtr;
dm_map_fn map;
dm_map_request_fn map_rq;
+   dm_make_request_fn mk_rq;
dm_endio_fn end_io;
dm_request_endio_fn rq_end_io;
dm_presuspend_fn presuspend;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] md: add new interface 'mk_rq' in target_type

2012-09-19 Thread Robin Dong
From: Robin Dong 

We are now trying to modify flashcache (https://github.com/facebook/flashcache)
to make it request-based so that we can let the cfq io-controller manage the
bandwidth between different io cgroups.

A search in the dm directory tells me that only multipath is a request-based
dm target, and its functionality is very simple: map_rq() just maps the
request to a different underlying device. We can't work in this way because:

1. the request processed by map_rq() needs to be issued to different lower
devices (disk device and cache device, in flashcache); therefore the request
can't be totally remapped by simply changing its queue and returning
DM_MAPIO_REMAPPED in map_rq() like multipath_map() does
2. submitting bios directly in map_rq() (by returning DM_MAPIO_SUBMITTED)
will trigger BUG_ON(!irqs_disabled()) in dm_request_fn(), because
submit_bio()->generic_make_request()->blk_queue_bio() will definitely call
spin_unlock_irq to enable the irqs

As shown above, the map_rq() interface provided by the device-mapper
framework is not enough for an autonomous target like flashcache.

We propose to add a new mk_rq interface so that we can make the requests by
ourselves; a usage sketch follows.
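
A minimal sketch of a target plugging into the proposed hook (everything here
besides the mk_rq field and its signature is a hypothetical example):

    /* Sketch only: dm_request() will call mk_rq first when it is set. */
    static int flashcache_mk_rq(struct dm_target *ti, struct request_queue *q,
                                struct bio *bio)
    {
            /* build requests from the bio ourselves and issue them to the
             * cache device and the disk device */
            return 0;
    }

    static struct target_type flashcache_target = {
            .name    = "flashcache",
            .version = {1, 0, 0},
            .module  = THIS_MODULE,
            .ctr     = flashcache_ctr,  /* hypothetical */
            .dtr     = flashcache_dtr,  /* hypothetical */
            .mk_rq   = flashcache_mk_rq,
    };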

Signed-off-by: Robin Dong 
---
 drivers/md/dm-io.c|   58 
 drivers/md/dm-log.c   |1 +
 include/linux/dm-io.h |3 ++
 3 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea5dd28..f767792 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data)
 /*-
  * IO routines that accept a list of pages.
  *---*/
-static void do_region(int rw, unsigned region, struct dm_io_region *where,
- struct dpages *dp, struct io *io)
+static void do_region(struct dm_io_request *io_req, unsigned region,
+   struct dm_io_region *where, struct dpages *dp, struct io *io)
 {
struct bio *bio;
struct page *page;
@@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct 
dm_io_region *where,
sector_t remaining = where->count;
struct request_queue *q = bdev_get_queue(where->bdev);
sector_t discard_sectors;
+   int rw = io_req->bi_rw;
 
/*
 * where->count may be zero if rw holds a flush and we need to
@@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct 
dm_io_region *where,
}
 
atomic_inc(&io->count);
-   submit_bio(rw, bio);
+   if (!io_req->only_create_bio)
+   submit_bio(rw, bio);
+   else {
+   bio->bi_rw |= rw;
+   if (io_req->start) {
+   io_req->end->bi_next = bio;
+   io_req->end = bio;
+   } else
+   io_req->start = io_req->end = bio;
+   bio->bi_next = NULL;
+   }
} while (remaining);
 }
 
-static void dispatch_io(int rw, unsigned int num_regions,
+static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions,
struct dm_io_region *where, struct dpages *dp,
struct io *io, int sync)
 {
int i;
+   int rw = io_req->bi_rw;
struct dpages old_pages = *dp;
 
BUG_ON(num_regions > DM_IO_MAX_REGIONS);
@@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
if (where[i].count || (rw & REQ_FLUSH))
-   do_region(rw, i, where + i, dp, io);
+   do_region(io_req, i, where + i, dp, io);
}
 
/*
@@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
dec_count(io, 0, 0);
 }
 
-static int sync_io(struct dm_io_client *client, unsigned int num_regions,
-  struct dm_io_region *where, int rw, struct dpages *dp,
+static int sync_io(struct dm_io_request *io_req,  unsigned int num_regions,
+  struct dm_io_region *where, struct dpages *dp,
   unsigned long *error_bits)
 {
/*
@@ -385,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned 
int num_regions,
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
 
-   if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
+   if (num_regions > 1 && (io_req->bi_rw & RW_MASK) != WRITE) {
WARN_ON(1);
return -EIO;
}
@@ -393,12 +4

[PATCH 2/2] md: modify dm_io() so it could return bios instead of submitting it

2012-09-02 Thread Robin Dong
From: Robin Dong 

When trying to make flashcache request-based (currently it is bio-based), we
need to build requests from the bios ourselves, but dm_io() will submit these
bios directly, so we propose to modify dm_io() to return the bios instead of
submitting them.

This could also improve the flexibility of dm_io().

Signed-off-by: Robin Dong 
---
 drivers/md/dm-bufio.c   |2 +
 drivers/md/dm-io.c  |   58 +++
 drivers/md/dm-kcopyd.c  |1 +
 drivers/md/dm-log.c |1 +
 drivers/md/dm-raid1.c   |3 ++
 drivers/md/dm-snap-persistent.c |1 +
 include/linux/dm-io.h   |3 ++
 7 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cc06a1e..f5867b9 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -487,6 +487,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t 
block,
.notify.fn = dmio_complete,
.notify.context = b,
.client = b->c->dm_io,
+   .submit_bio = 1,
};
struct dm_io_region region = {
.bdev = b->c->bdev,
@@ -1200,6 +1201,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = c->dm_io,
+   .submit_bio = 1,
};
struct dm_io_region io_reg = {
.bdev = c->bdev,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea5dd28..f235182 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -287,8 +287,8 @@ static void km_dp_init(struct dpages *dp, void *data)
 /*-
  * IO routines that accept a list of pages.
  *---*/
-static void do_region(int rw, unsigned region, struct dm_io_region *where,
- struct dpages *dp, struct io *io)
+static void do_region(struct dm_io_request *io_req, unsigned region,
+   struct dm_io_region *where, struct dpages *dp, struct io *io)
 {
struct bio *bio;
struct page *page;
@@ -298,6 +298,7 @@ static void do_region(int rw, unsigned region, struct 
dm_io_region *where,
sector_t remaining = where->count;
struct request_queue *q = bdev_get_queue(where->bdev);
sector_t discard_sectors;
+   int rw = io_req->bi_rw;
 
/*
 * where->count may be zero if rw holds a flush and we need to
@@ -339,15 +340,26 @@ static void do_region(int rw, unsigned region, struct 
dm_io_region *where,
}
 
atomic_inc(&io->count);
-   submit_bio(rw, bio);
+   if (io_req->submit_bio)
+   submit_bio(rw, bio);
+   else {
+   bio->bi_rw |= rw;
+   if (io_req->start) {
+   io_req->end->bi_next = bio;
+   io_req->end = bio;
+   } else
+   io_req->start = io_req->end = bio;
+   bio->bi_next = NULL;
+   }
} while (remaining);
 }
 
-static void dispatch_io(int rw, unsigned int num_regions,
+static void dispatch_io(struct dm_io_request *io_req, unsigned int num_regions,
struct dm_io_region *where, struct dpages *dp,
struct io *io, int sync)
 {
int i;
+   int rw = io_req->bi_rw;
struct dpages old_pages = *dp;
 
BUG_ON(num_regions > DM_IO_MAX_REGIONS);
@@ -362,7 +374,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
if (where[i].count || (rw & REQ_FLUSH))
-   do_region(rw, i, where + i, dp, io);
+   do_region(io_req, i, where + i, dp, io);
}
 
/*
@@ -372,8 +384,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
dec_count(io, 0, 0);
 }
 
-static int sync_io(struct dm_io_client *client, unsigned int num_regions,
-  struct dm_io_region *where, int rw, struct dpages *dp,
+static int sync_io(struct dm_io_request *io_req,  unsigned int num_regions,
+  struct dm_io_region *where, struct dpages *dp,
   unsigned long *error_bits)
 {
/*
@@ -385,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned 
int num_regions,
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
 
-   if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
return -EIO;
}
@@ -393,12 +405,12 @@ static int sync_io
+   if (num_regions > 1 && (io_req->bi_rw & RW_MASK) != WRITE) {

[PATCH 1/2] md: add new interface 'mk_rq' in target_type

2012-09-02 Thread Robin Dong
From: Robin Dong 

We are now trying to modify flashcache (https://github.com/facebook/flashcache)
to make it request-based so that we can let the cfq io-controller manage the
bandwidth between different io cgroups.

A search in the dm directory tells me that only multipath is a request-based
dm target, and its functionality is very simple: map_rq() just maps the
request to a different underlying device. We can't work in this way because:

1. the request processed by map_rq() needs to be issued to different lower
devices (disk device and cache device, in flashcache); therefore the request
can't be totally remapped by simply changing its queue and returning
DM_MAPIO_REMAPPED in map_rq() like multipath_map() does
2. submitting bios directly in map_rq() (by returning DM_MAPIO_SUBMITTED)
will trigger BUG_ON(!irqs_disabled()) in dm_request_fn(), because
submit_bio()->generic_make_request()->blk_queue_bio() will definitely call
spin_unlock_irq to enable the irqs


As shown above, the map_rq() interface provided by the device-mapper
framework is not enough for an autonomous target like flashcache.

We propose to add a new mk_rq interface so that we can make the requests by
ourselves.

Signed-off-by: Robin Dong 
---
 drivers/md/dm.c   |   10 ++
 include/linux/device-mapper.h |3 +++
 2 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4e09b6f..3ae67de 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1459,11 +1459,21 @@ static int dm_request_based(struct mapped_device *md)
 static void dm_request(struct request_queue *q, struct bio *bio)
 {
struct mapped_device *md = q->queuedata;
+   struct dm_table *map = dm_get_live_table(md);
+   struct dm_target *ti = dm_table_find_target(map, bio->bi_sector);
+
+   if (ti->type->mk_rq) {
+   ti->type->mk_rq(ti, q, bio);
+   goto out;
+   }
 
if (dm_request_based(md))
blk_queue_bio(q, bio);
else
_dm_request(q, bio);
+
+out:
+   dm_table_put(map);
 }
 
 void dm_dispatch_request(struct request *rq)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 38d27a1..2386389 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -50,6 +50,8 @@ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio 
*bio,
  union map_info *map_context);
 typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
  union map_info *map_context);
+typedef int (*dm_make_request_fn) (struct dm_target *ti,
+   struct request_queue *q, struct bio *bio);
 
 /*
  * Returns:
@@ -136,6 +138,7 @@ struct target_type {
dm_dtr_fn dtr;
dm_map_fn map;
dm_map_request_fn map_rq;
+   dm_make_request_fn mk_rq;
dm_endio_fn end_io;
dm_request_endio_fn rq_end_io;
dm_presuspend_fn presuspend;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


@@ -393,12 +405,12 @@ static int sync_io