[PATCH 7/8] wbt: add general throttling mechanism

2016-09-07 Thread Jens Axboe
We can hook this up to the block layer, to help throttle buffered
writes. Or NFS can tap into it, to accomplish the same.

wbt registers a few trace points that can be used to track what is
happening in the system:

wbt_lat: 259:0: latency 2446318
wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
   wmean=518866, wmin=15522, wmax=5330353, wsamples=57
wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32

This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat
dumps the current read/write stats for that window, and wbt_step shows a
step down event where we now scale back writes. Each trace includes the
device, 259:0 in this case.
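
For orientation, here is a rough sketch of how a consumer could drive the
interface declared in include/linux/wbt.h below. It is illustrative only: the
real block-layer hookup is in patch 8/8, and my_dev, my_request, the stat ops
argument and the queue depth value are made-up names and numbers for the
example.

/*
 * Illustrative sketch only, not part of the patch: one way a consumer
 * (the block layer in patch 8/8, or NFS) might use the interface that
 * include/linux/wbt.h declares. my_dev, my_request and the flag values
 * passed around here are invented for the example.
 */
#include <linux/spinlock.h>
#include <linux/wbt.h>

struct my_request {
	struct wb_issue_stat wb_stat;	/* per-IO cookie that wbt stamps and reads */
};

struct my_dev {
	struct rq_wb *rq_wb;
	spinlock_t lock;
};

static void my_dev_setup(struct my_dev *dev, struct backing_dev_info *bdi,
			 struct wb_stat_ops *stat_ops, void *cookie)
{
	/* error handling omitted for brevity */
	dev->rq_wb = wbt_init(bdi, stat_ops, cookie);
	wbt_set_queue_depth(dev->rq_wb, 32);	/* whatever the device reports */
	wbt_set_write_cache(dev->rq_wb, true);	/* device has a volatile write cache */
}

static void my_dev_submit(struct my_dev *dev, struct my_request *rq,
			  unsigned int rw_flags)
{
	/* may sleep until inflight buffered writeback drops below the current limit */
	unsigned int wb_acct = wbt_wait(dev->rq_wb, rw_flags, &dev->lock);

	wbt_track(&rq->wb_stat, wb_acct);	/* remember tracked/read state */
	wbt_issue(dev->rq_wb, &rq->wb_stat);	/* stamp the issue time */
}

static void my_dev_complete(struct my_dev *dev, struct my_request *rq)
{
	/* completion latency feeds the wbt_lat/wbt_stat/wbt_step machinery above */
	wbt_done(dev->rq_wb, &rq->wb_stat);
}

The value returned by wbt_wait() is presumably the WBT_TRACKED/WBT_READ state
that wbt_track() records in the per-IO wb_issue_stat, so the completion side
can tell tracked writes and reads apart.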

Signed-off-by: Jens Axboe 
---
 include/linux/wbt.h| 120 
 include/trace/events/wbt.h | 153 ++
 lib/Kconfig|   3 +
 lib/Makefile   |   1 +
 lib/wbt.c  | 679 +
 5 files changed, 956 insertions(+)
 create mode 100644 include/linux/wbt.h
 create mode 100644 include/trace/events/wbt.h
 create mode 100644 lib/wbt.c

diff --git a/include/linux/wbt.h b/include/linux/wbt.h
new file mode 100644
index ..5ffcd1409c2f
--- /dev/null
+++ b/include/linux/wbt.h
@@ -0,0 +1,120 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/ktime.h>
+
+enum {
+   ISSUE_STAT_TRACKED  = 1ULL << 63,
+   ISSUE_STAT_READ = 1ULL << 62,
+   ISSUE_STAT_MASK = ISSUE_STAT_TRACKED | ISSUE_STAT_READ,
+   ISSUE_STAT_TIME_MASK= ~ISSUE_STAT_MASK,
+
+   WBT_TRACKED = 1,
+   WBT_READ= 2,
+};
+
+struct wb_issue_stat {
+   u64 time;
+};
+
+static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
+{
+   stat->time = (stat->time & ISSUE_STAT_MASK) |
+   (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
+}
+
+static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
+{
+   return stat->time & ISSUE_STAT_TIME_MASK;
+}
+
+static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
+{
+   stat->time |= ISSUE_STAT_TRACKED;
+}
+
+static inline void wbt_clear_state(struct wb_issue_stat *stat)
+{
+   stat->time &= ~(ISSUE_STAT_TRACKED | ISSUE_STAT_READ);
+}
+
+static inline bool wbt_tracked(struct wb_issue_stat *stat)
+{
+   return (stat->time & ISSUE_STAT_TRACKED) != 0;
+}
+
+static inline void wbt_mark_read(struct wb_issue_stat *stat)
+{
+   stat->time |= ISSUE_STAT_READ;
+}
+
+static inline bool wbt_is_read(struct wb_issue_stat *stat)
+{
+   return (stat->time & ISSUE_STAT_READ) != 0;
+}
+
+struct wb_stat_ops {
+   void (*get)(void *, struct blk_rq_stat *);
+   bool (*is_current)(struct blk_rq_stat *);
+   void (*clear)(void *);
+};
+
+struct rq_wb {
+   /*
+* Settings that govern how we throttle
+*/
+   unsigned int wb_background; /* background writeback */
+   unsigned int wb_normal; /* normal writeback */
+   unsigned int wb_max;/* max throughput writeback */
+   int scale_step;
+   bool scaled_max;
+
+   u64 win_nsec;   /* default window size */
+   u64 cur_win_nsec;   /* current window size */
+
+   /*
+* Number of consecutive periods where we don't have enough
+* information to make a firm scale up/down decision.
+*/
+   unsigned int unknown_cnt;
+
+   struct timer_list window_timer;
+
+   s64 sync_issue;
+   void *sync_cookie;
+
+   unsigned int wc;
+   unsigned int queue_depth;
+
+   unsigned long last_issue;   /* last non-throttled issue */
+   unsigned long last_comp;/* last non-throttled comp */
+   unsigned long min_lat_nsec;
+   struct backing_dev_info *bdi;
+   struct request_queue *q;
+   wait_queue_head_t wait;
+   atomic_t inflight;
+
+   struct wb_stat_ops *stat_ops;
+   void *ops_data;
+};
+
+struct backing_dev_info;
+
+void __wbt_done(struct rq_wb *);
+void wbt_done(struct rq_wb *, struct wb_issue_stat *);
+unsigned int wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
+void wbt_exit(struct rq_wb *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_disable(struct rq_wb *);
+void wbt_track(struct wb_issue_stat *, unsigned int);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+#endif
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
new file mode 100644
index ..926c7ee0ef4e
--- /dev/null
+++ b/include/trace/events/wbt.h
@@ -0,0 +1,153 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM wbt


Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-09-01 Thread Jens Axboe

On 09/01/2016 12:05 PM, Omar Sandoval wrote:

diff --git a/lib/Kconfig b/lib/Kconfig
index d79909dc01ec..5a65a1f91889 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -550,4 +550,8 @@ config STACKDEPOT
bool
select STACKTRACE

+config WBT
+   bool
+   select SCALE_BITMAP


Looks like this snuck in from your experiments to get this to work on
top of scale_bitmap?


Oops yes, it is indeed. Killed, thanks.


+	if (waitqueue_active(&rwb->wait)) {
+		int diff = limit - inflight;
+
+		if (!inflight || diff >= rwb->wb_background / 2)
+			wake_up_nr(&rwb->wait, 1);


wake_up(&rwb->wait)?


Yeah, that'd be cleaner. I think this is a leftover from when I 
experimented with batched wakeups, with nr != 1. I'll change it to just 
wake_up().
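
For reference, the tail then reads like this with just that swap applied
(sketch of the change discussed here, nothing else altered):

	if (waitqueue_active(&rwb->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up(&rwb->wait);
	}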


--
Jens Axboe




Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-09-01 Thread Omar Sandoval
On Wed, Aug 31, 2016 at 11:05:50AM -0600, Jens Axboe wrote:
> We can hook this up to the block layer, to help throttle buffered
> writes. Or NFS can tap into it, to accomplish the same.
> 
> wbt registers a few trace points that can be used to track what is
> happening in the system:
> 
> wbt_lat: 259:0: latency 2446318
> wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
>wmean=518866, wmin=15522, wmax=5330353, wsamples=57
> wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, 
> max=32
> 
> This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat
> dumps the current read/write stats for that window, and wbt_step shows a
> step down event where we now scale back writes. Each trace includes the
> device, 259:0 in this case.
> 
> Signed-off-by: Jens Axboe 
> ---
>  include/linux/wbt.h| 118 +
>  include/trace/events/wbt.h | 122 ++
>  lib/Kconfig|   4 +
>  lib/Makefile   |   1 +
>  lib/wbt.c  | 587 +
>  5 files changed, 832 insertions(+)
>  create mode 100644 include/linux/wbt.h
>  create mode 100644 include/trace/events/wbt.h
>  create mode 100644 lib/wbt.c
> 

[snip]

> diff --git a/lib/Kconfig b/lib/Kconfig
> index d79909dc01ec..5a65a1f91889 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -550,4 +550,8 @@ config STACKDEPOT
>   bool
>   select STACKTRACE
>  
> +config WBT
> + bool
> + select SCALE_BITMAP

Looks like this snuck in from your experiments to get this to work on
top of scale_bitmap?

[snip]

> +void __wbt_done(struct rq_wb *rwb)
> +{
> +	int inflight, limit;
> +
> +	inflight = atomic_dec_return(&rwb->inflight);
> +
> +	/*
> +	 * wbt got disabled with IO in flight. Wake up any potential
> +	 * waiters, we don't have to do more than that.
> +	 */
> +	if (unlikely(!rwb_enabled(rwb))) {
> +		wake_up_all(&rwb->wait);
> +		return;
> +	}
> +
> +	/*
> +	 * If the device does write back caching, drop further down
> +	 * before we wake people up.
> +	 */
> +	if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
> +		limit = 0;
> +	else
> +		limit = rwb->wb_normal;
> +
> +	/*
> +	 * Don't wake anyone up if we are above the normal limit.
> +	 */
> +	if (inflight && inflight >= limit)
> +		return;
> +
> +	if (waitqueue_active(&rwb->wait)) {
> +		int diff = limit - inflight;
> +
> +		if (!inflight || diff >= rwb->wb_background / 2)
> +			wake_up_nr(&rwb->wait, 1);

wake_up(&rwb->wait)?

-- 
Omar


[PATCH 7/8] wbt: add general throttling mechanism

2016-08-31 Thread Jens Axboe
We can hook this up to the block layer, to help throttle buffered
writes. Or NFS can tap into it, to accomplish the same.

wbt registers a few trace points that can be used to track what is
happening in the system:

wbt_lat: 259:0: latency 2446318
wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
   wmean=518866, wmin=15522, wmax=5330353, wsamples=57
wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32

This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat
dumps the current read/write stats for that window, and wbt_step shows a
step down event where we now scale back writes. Each trace includes the
device, 259:0 in this case.
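
As a reading aid for the wbt_step line above, here is a hedged sketch of one
relationship between queue depth, scale_step and the three limits that matches
those numbers; the authoritative calculation is in lib/wbt.c, which is not
quoted in this excerpt.

/*
 * Sketch only: one relationship consistent with the example trace line
 * "step=1, background=8, normal=16, max=32" above, assuming a device
 * queue depth of 64. The real formula lives in lib/wbt.c (not quoted).
 */
static void example_wb_limits(unsigned int queue_depth, unsigned int scale_step,
			      unsigned int *background, unsigned int *normal,
			      unsigned int *max)
{
	*max = 1 + ((queue_depth - 1) >> scale_step);	/* roughly halves per step */
	*normal = (*max + 1) / 2;
	*background = (*max + 1) / 4;
}

With queue_depth = 64 and scale_step = 1 this gives max = 32, normal = 16 and
background = 8, matching the trace; read this way, a step down bumps
scale_step and shrinks all three limits.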

Signed-off-by: Jens Axboe 
---
 include/linux/wbt.h| 118 +
 include/trace/events/wbt.h | 122 ++
 lib/Kconfig|   4 +
 lib/Makefile   |   1 +
 lib/wbt.c  | 587 +
 5 files changed, 832 insertions(+)
 create mode 100644 include/linux/wbt.h
 create mode 100644 include/trace/events/wbt.h
 create mode 100644 lib/wbt.c

diff --git a/include/linux/wbt.h b/include/linux/wbt.h
new file mode 100644
index ..14473d550a18
--- /dev/null
+++ b/include/linux/wbt.h
@@ -0,0 +1,118 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/ktime.h>
+
+enum {
+   ISSUE_STAT_TRACKED  = 1ULL << 63,
+   ISSUE_STAT_READ = 1ULL << 62,
+   ISSUE_STAT_MASK = ISSUE_STAT_TRACKED | ISSUE_STAT_READ,
+   ISSUE_STAT_TIME_MASK= ~ISSUE_STAT_MASK,
+
+   WBT_TRACKED = 1,
+   WBT_READ= 2,
+};
+
+struct wb_issue_stat {
+   u64 time;
+};
+
+static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
+{
+   stat->time = (stat->time & ISSUE_STAT_MASK) |
+   (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
+}
+
+static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
+{
+   return stat->time & ISSUE_STAT_TIME_MASK;
+}
+
+static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
+{
+   stat->time |= ISSUE_STAT_TRACKED;
+}
+
+static inline void wbt_clear_state(struct wb_issue_stat *stat)
+{
+   stat->time &= ~(ISSUE_STAT_TRACKED | ISSUE_STAT_READ);
+}
+
+static inline bool wbt_tracked(struct wb_issue_stat *stat)
+{
+   return (stat->time & ISSUE_STAT_TRACKED) != 0;
+}
+
+static inline void wbt_mark_read(struct wb_issue_stat *stat)
+{
+   stat->time |= ISSUE_STAT_READ;
+}
+
+static inline bool wbt_is_read(struct wb_issue_stat *stat)
+{
+   return (stat->time & ISSUE_STAT_READ) != 0;
+}
+
+struct wb_stat_ops {
+   void (*get)(void *, struct blk_rq_stat *);
+   void (*clear)(void *);
+};
+
+struct rq_wb {
+   /*
+* Settings that govern how we throttle
+*/
+   unsigned int wb_background; /* background writeback */
+   unsigned int wb_normal; /* normal writeback */
+   unsigned int wb_max;/* max throughput writeback */
+   unsigned int scale_step;
+
+   u64 win_nsec;   /* default window size */
+   u64 cur_win_nsec;   /* current window size */
+
+   /*
+* Number of consecutive periods where we don't have enough
+* information to make a firm scale up/down decision.
+*/
+   unsigned int unknown_cnt;
+
+   struct timer_list window_timer;
+
+   s64 sync_issue;
+   void *sync_cookie;
+
+   unsigned int wc;
+   unsigned int queue_depth;
+
+   unsigned long last_issue;   /* last non-throttled issue */
+   unsigned long last_comp;/* last non-throttled comp */
+   unsigned long min_lat_nsec;
+   struct backing_dev_info *bdi;
+   struct request_queue *q;
+   wait_queue_head_t wait;
+   atomic_t inflight;
+
+   struct wb_stat_ops *stat_ops;
+   void *ops_data;
+};
+
+struct backing_dev_info;
+
+void __wbt_done(struct rq_wb *);
+void wbt_done(struct rq_wb *, struct wb_issue_stat *);
+unsigned int wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
+void wbt_exit(struct rq_wb *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_disable(struct rq_wb *);
+void wbt_track(struct wb_issue_stat *, unsigned int);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+#endif
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
new file mode 100644
index ..a4b8b2e57bb1
--- /dev/null
+++ b/include/trace/events/wbt.h
@@ -0,0 +1,122 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM wbt
+
+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)

Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jens Axboe

On 05/03/2016 12:14 PM, Jens Axboe wrote:

On 05/03/2016 10:59 AM, Jens Axboe wrote:

On 05/03/2016 09:48 AM, Jan Kara wrote:

On Tue 03-05-16 17:40:32, Jan Kara wrote:

On Tue 03-05-16 11:34:10, Jan Kara wrote:

Yeah, once I'll hunt down that regression with old disk, I can have
a look
into how writeback throttling plays together with blkio-controller.


So I've tried the following script (note that you need cgroup v2 for
writeback IO to be throttled):

---
mkdir /sys/fs/cgroup/group1
echo 1000 >/sys/fs/cgroup/group1/io.weight
dd if=/dev/zero of=/mnt/file1 bs=1M count=1&
DD1=$!
echo $DD1 >/sys/fs/cgroup/group1/cgroup.procs

mkdir /sys/fs/cgroup/group2
echo 100 >/sys/fs/cgroup/group2/io.weight
#echo "259:65536 wbps=500" >/sys/fs/cgroup/group2/io.max
echo "259:65536 wbps=max" >/sys/fs/cgroup/group2/io.max
dd if=/dev/zero of=/mnt/file2 bs=1M count=1&
DD2=$!
echo $DD2 >/sys/fs/cgroup/group2/cgroup.procs

while true; do
 sleep 1
 kill -USR1 $DD1
 kill -USR1 $DD2
 echo '==='
done
---

and watched the progress of the dd processes in different cgroups.
The 1/10
weight difference has no effect with your writeback patches - the
situation
after one minute:

3120+1 records in
3120+1 records out
3272392704 bytes (3.3 GB) copied, 63.7119 s, 51.4 MB/s
3217+1 records in
3217+1 records out
3374010368 bytes (3.4 GB) copied, 63.5819 s, 53.1 MB/s

I should add that even without your patches the progress doesn't quite
correspond to the weight ratio:


Forgot to fill in corresponding data for unpatched kernel here:

5962+2 records in
5962+2 records out
6252281856 bytes (6.3 GB) copied, 64.1719 s, 97.4 MB/s
1502+0 records in
1502+0 records out
1574961152 bytes (1.6 GB) copied, 64.207 s, 24.5 MB/s


Thanks for testing this, I'll see what we can do about that. It stands
to reason that we'll throttle a heavier writer more, statistically. But
I'm assuming this above test was run basically with just the writes
going, so no real competition? And hence we end up throttling them
equally much, destroying the weighting in the process. But for both
cases, we basically don't pay any attention to cgroup weights.


but still there is noticeable difference to cgroups with different
weights.

OTOH blk-throttle combines well with your patches: Limiting one
cgroup to
5 M/s results in numbers like:

3883+2 records in
3883+2 records out
4072091648 bytes (4.1 GB) copied, 36.6713 s, 111 MB/s
413+0 records in
413+0 records out
433061888 bytes (433 MB) copied, 36.8939 s, 11.7 MB/s

which is fine and comparable with unpatched kernel. Higher throughput
number is because we do buffered writes and dd reports what it wrote
into
page cache. And there is no wonder blk-throttle combines fine - it
throttles bios which happens before we reach writeback throttling
mechanism.


OK, that's good, at least that part works fine. And yes, the throttle
path is hit before we end up in the make_request_fn, which is where wbt
drops in.


So I believe this demonstrates that your writeback throttling just doesn't
work well with selective scheduling policy that happens below it because it
can essentially lead to IO priority inversion issues...


Is this testing still done on the QD=1 ATA disk? Not too surprising that
this falls apart, since we have very little room to maneuver. I wonder
if a normal SATA with NCQ would behave better in this regard. I'll have
to test a bit and think about how we can best handle this case.


I think what we'll do for now is just disable wbt IFF we have a non-root
cgroup attached to CFQ. Done here:

http://git.kernel.dk/cgit/linux-block/commit/?h=wb-buf-throttle&id=7315756efe76bbdf83076fc9dbc569bbb4da5d32


That was a bit too untested.. This should be better, it taps into where 
cfq normally notices a difference in blkcg:


http://git.kernel.dk/cgit/linux-block/commit/?h=wb-buf-throttle&id=9b89e1bb666bd036a4cb1313479435087fb86ba0
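
A minimal sketch of that idea (not the linked commit, which is not quoted
here), assuming the request_queue carries the rq_wb pointer that patch 8/8
adds:

/*
 * Sketch only, not the commit linked above: once CFQ sees a non-root
 * blkcg on the queue, switch writeback throttling off so it cannot
 * invert the proportional-share decisions made below it. Assumes
 * q->rq_wb exists, as added by patch 8/8 of this series.
 */
static void maybe_disable_wbt(struct request_queue *q, struct blkcg *blkcg)
{
	if (blkcg != &blkcg_root && q->rq_wb)
		wbt_disable(q->rq_wb);
}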


--
Jens Axboe



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jens Axboe

On 05/03/2016 10:59 AM, Jens Axboe wrote:

On 05/03/2016 09:48 AM, Jan Kara wrote:

On Tue 03-05-16 17:40:32, Jan Kara wrote:

On Tue 03-05-16 11:34:10, Jan Kara wrote:

Yeah, once I'll hunt down that regression with old disk, I can have
a look
into how writeback throttling plays together with blkio-controller.


So I've tried the following script (note that you need cgroup v2 for
writeback IO to be throttled):

---
mkdir /sys/fs/cgroup/group1
echo 1000 >/sys/fs/cgroup/group1/io.weight
dd if=/dev/zero of=/mnt/file1 bs=1M count=1&
DD1=$!
echo $DD1 >/sys/fs/cgroup/group1/cgroup.procs

mkdir /sys/fs/cgroup/group2
echo 100 >/sys/fs/cgroup/group2/io.weight
#echo "259:65536 wbps=500" >/sys/fs/cgroup/group2/io.max
echo "259:65536 wbps=max" >/sys/fs/cgroup/group2/io.max
dd if=/dev/zero of=/mnt/file2 bs=1M count=1&
DD2=$!
echo $DD2 >/sys/fs/cgroup/group2/cgroup.procs

while true; do
 sleep 1
 kill -USR1 $DD1
 kill -USR1 $DD2
 echo  '==='
done
---

and watched the progress of the dd processes in different cgroups.
The 1/10
weight difference has no effect with your writeback patches - the
situation
after one minute:

3120+1 records in
3120+1 records out
3272392704 bytes (3.3 GB) copied, 63.7119 s, 51.4 MB/s
3217+1 records in
3217+1 records out
3374010368 bytes (3.4 GB) copied, 63.5819 s, 53.1 MB/s

I should add that even without your patches the progress doesn't quite
correspond to the weight ratio:


Forgot to fill in corresponding data for unpatched kernel here:

5962+2 records in
5962+2 records out
6252281856 bytes (6.3 GB) copied, 64.1719 s, 97.4 MB/s
1502+0 records in
1502+0 records out
1574961152 bytes (1.6 GB) copied, 64.207 s, 24.5 MB/s


Thanks for testing this, I'll see what we can do about that. It stands
to reason that we'll throttle a heavier writer more, statistically. But
I'm assuming this above test was run basically with just the writes
going, so no real competition? And hence we end up throttling them
equally much, destroying the weighting in the process. But for both
cases, we basically don't pay any attention to cgroup weights.


but still there is noticeable difference to cgroups with different
weights.

OTOH blk-throttle combines well with your patches: Limiting one
cgroup to
5 M/s results in numbers like:

3883+2 records in
3883+2 records out
4072091648 bytes (4.1 GB) copied, 36.6713 s, 111 MB/s
413+0 records in
413+0 records out
433061888 bytes (433 MB) copied, 36.8939 s, 11.7 MB/s

which is fine and comparable with unpatched kernel. Higher throughput
number is because we do buffered writes and dd reports what it wrote
into
page cache. And there is no wonder blk-throttle combines fine - it
throttles bios which happens before we reach writeback throttling
mechanism.


OK, that's good, at least that part works fine. And yes, the throttle
path is hit before we end up in the make_request_fn, which is where wbt
drops in.


So I believe this demonstrates that your writeback throttling just doesn't
work well with selective scheduling policy that happens below it because it
can essentially lead to IO priority inversion issues...


Is this testing still done on the QD=1 ATA disk? Not too surprising that
this falls apart, since we have very little room to maneuver. I wonder
if a normal SATA with NCQ would behave better in this regard. I'll have
to test a bit and think about how we can best handle this case.


I think what we'll do for now is just disable wbt IFF we have a non-root 
cgroup attached to CFQ. Done here:


http://git.kernel.dk/cgit/linux-block/commit/?h=wb-buf-throttle&id=7315756efe76bbdf83076fc9dbc569bbb4da5d32

We don't have a strong need for wbt (supposedly) since CFQ should take 
care of most of it, if you have policies set for proportional sharing.


Longer term it's not a concern either, as we'll move away from that 
model anyway.


--
Jens Axboe



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jens Axboe

On 05/03/2016 09:48 AM, Jan Kara wrote:

On Tue 03-05-16 17:40:32, Jan Kara wrote:

On Tue 03-05-16 11:34:10, Jan Kara wrote:

Yeah, once I'll hunt down that regression with old disk, I can have a look
into how writeback throttling plays together with blkio-controller.


So I've tried the following script (note that you need cgroup v2 for
writeback IO to be throttled):

---
mkdir /sys/fs/cgroup/group1
echo 1000 >/sys/fs/cgroup/group1/io.weight
dd if=/dev/zero of=/mnt/file1 bs=1M count=1&
DD1=$!
echo $DD1 >/sys/fs/cgroup/group1/cgroup.procs

mkdir /sys/fs/cgroup/group2
echo 100 >/sys/fs/cgroup/group2/io.weight
#echo "259:65536 wbps=500" >/sys/fs/cgroup/group2/io.max
echo "259:65536 wbps=max" >/sys/fs/cgroup/group2/io.max
dd if=/dev/zero of=/mnt/file2 bs=1M count=1&
DD2=$!
echo $DD2 >/sys/fs/cgroup/group2/cgroup.procs

while true; do
 sleep 1
 kill -USR1 $DD1
 kill -USR1 $DD2
 echo  '==='
done
---

and watched the progress of the dd processes in different cgroups. The 1/10
weight difference has no effect with your writeback patches - the situation
after one minute:

3120+1 records in
3120+1 records out
3272392704 bytes (3.3 GB) copied, 63.7119 s, 51.4 MB/s
3217+1 records in
3217+1 records out
3374010368 bytes (3.4 GB) copied, 63.5819 s, 53.1 MB/s

I should add that even without your patches the progress doesn't quite
correspond to the weight ratio:


Forgot to fill in corresponding data for unpatched kernel here:

5962+2 records in
5962+2 records out
6252281856 bytes (6.3 GB) copied, 64.1719 s, 97.4 MB/s
1502+0 records in
1502+0 records out
1574961152 bytes (1.6 GB) copied, 64.207 s, 24.5 MB/s


Thanks for testing this, I'll see what we can do about that. It stands 
to reason that we'll throttle a heavier writer more, statistically. But 
I'm assuming this above test was run basically with just the writes 
going, so no real competition? And hence we end up throttling them 
equally much, destroying the weighting in the process. But for both 
cases, we basically don't pay any attention to cgroup weights.



but still there is noticeable difference to cgroups with different weights.

OTOH blk-throttle combines well with your patches: Limiting one cgroup to
5 M/s results in numbers like:

3883+2 records in
3883+2 records out
4072091648 bytes (4.1 GB) copied, 36.6713 s, 111 MB/s
413+0 records in
413+0 records out
433061888 bytes (433 MB) copied, 36.8939 s, 11.7 MB/s

which is fine and comparable with unpatched kernel. Higher throughput
number is because we do buffered writes and dd reports what it wrote into
page cache. And there is no wonder blk-throttle combines fine - it
throttles bios which happens before we reach writeback throttling
mechanism.


OK, that's good, at least that part works fine. And yes, the throttle 
path is hit before we end up in the make_request_fn, which is where wbt 
drops in.
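
Put as a rough ordering sketch (names simplified, illustrative only, not
actual function signatures):

/*
 * Rough ordering for a buffered write, per the discussion above:
 *
 *   submit_bio()
 *     -> blkcg / blk-throttle limits applied to the bio
 *     -> make_request_fn()          <- wbt_wait() throttles here (patch 8/8)
 *          -> request allocation / dispatch to the device
 */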



So I believe this demonstrates that your writeback throttling just doesn't
work well with selective scheduling policy that happens below it because it
can essentially lead to IO priority inversion issues...


Is this testing still done on the QD=1 ATA disk? Not too surprising that
this falls apart, since we have very little room to maneuver. I wonder 
if a normal SATA with NCQ would behave better in this regard. I'll have 
to test a bit and think about how we can best handle this case.


--
Jens Axboe



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jan Kara
On Tue 03-05-16 17:40:32, Jan Kara wrote:
> On Tue 03-05-16 11:34:10, Jan Kara wrote:
> > Yeah, once I'll hunt down that regression with old disk, I can have a look
> > into how writeback throttling plays together with blkio-controller.
> 
> So I've tried the following script (note that you need cgroup v2 for
> writeback IO to be throttled):
> 
> ---
> mkdir /sys/fs/cgroup/group1
> echo 1000 >/sys/fs/cgroup/group1/io.weight
> dd if=/dev/zero of=/mnt/file1 bs=1M count=1&
> DD1=$!
> echo $DD1 >/sys/fs/cgroup/group1/cgroup.procs
> 
> mkdir /sys/fs/cgroup/group2
> echo 100 >/sys/fs/cgroup/group2/io.weight
> #echo "259:65536 wbps=500" >/sys/fs/cgroup/group2/io.max
> echo "259:65536 wbps=max" >/sys/fs/cgroup/group2/io.max
> dd if=/dev/zero of=/mnt/file2 bs=1M count=1&
> DD2=$!
> echo $DD2 >/sys/fs/cgroup/group2/cgroup.procs
> 
> while true; do
> sleep 1
> kill -USR1 $DD1
> kill -USR1 $DD2
> echo  '==='
> done
> ---
> 
> and watched the progress of the dd processes in different cgroups. The 1/10
> weight difference has no effect with your writeback patches - the situation
> after one minute:
> 
> 3120+1 records in
> 3120+1 records out
> 3272392704 bytes (3.3 GB) copied, 63.7119 s, 51.4 MB/s
> 3217+1 records in
> 3217+1 records out
> 3374010368 bytes (3.4 GB) copied, 63.5819 s, 53.1 MB/s
> 
> I should add that even without your patches the progress doesn't quite
> correspond to the weight ratio:

Forgot to fill in corresponding data for unpatched kernel here:

5962+2 records in
5962+2 records out
6252281856 bytes (6.3 GB) copied, 64.1719 s, 97.4 MB/s
1502+0 records in
1502+0 records out
1574961152 bytes (1.6 GB) copied, 64.207 s, 24.5 MB/s

> but still there is noticeable difference to cgroups with different weights.
> 
> OTOH blk-throttle combines well with your patches: Limiting one cgroup to
> 5 M/s results in numbers like:
> 
> 3883+2 records in
> 3883+2 records out
> 4072091648 bytes (4.1 GB) copied, 36.6713 s, 111 MB/s
> 413+0 records in
> 413+0 records out
> 433061888 bytes (433 MB) copied, 36.8939 s, 11.7 MB/s
> 
> which is fine and comparable with unpatched kernel. Higher throughput
> number is because we do buffered writes and dd reports what it wrote into
> page cache. And there is no wonder blk-throttle combines fine - it
> throttles bios which happens before we reach writeback throttling
> mechanism.
> 
> So I believe this demonstrates that your writeback throttling just doesn't
> work well with selective scheduling policy that happens below it because it
> can essentially lead to IO priority inversion issues...
> 
>   Honza
> -- 
> Jan Kara 
> SUSE Labs, CR
-- 
Jan Kara 
SUSE Labs, CR



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jan Kara
On Tue 03-05-16 11:34:10, Jan Kara wrote:
> Yeah, once I'll hunt down that regression with old disk, I can have a look
> into how writeback throttling plays together with blkio-controller.

So I've tried the following script (note that you need cgroup v2 for
writeback IO to be throttled):

---
mkdir /sys/fs/cgroup/group1
echo 1000 >/sys/fs/cgroup/group1/io.weight
dd if=/dev/zero of=/mnt/file1 bs=1M count=1&
DD1=$!
echo $DD1 >/sys/fs/cgroup/group1/cgroup.procs

mkdir /sys/fs/cgroup/group2
echo 100 >/sys/fs/cgroup/group2/io.weight
#echo "259:65536 wbps=500" >/sys/fs/cgroup/group2/io.max
echo "259:65536 wbps=max" >/sys/fs/cgroup/group2/io.max
dd if=/dev/zero of=/mnt/file2 bs=1M count=1&
DD2=$!
echo $DD2 >/sys/fs/cgroup/group2/cgroup.procs

while true; do
sleep 1
kill -USR1 $DD1
kill -USR1 $DD2
echo  '==='
done
---

and watched the progress of the dd processes in different cgroups. The 1/10
weight difference has no effect with your writeback patches - the situation
after one minute:

3120+1 records in
3120+1 records out
3272392704 bytes (3.3 GB) copied, 63.7119 s, 51.4 MB/s
3217+1 records in
3217+1 records out
3374010368 bytes (3.4 GB) copied, 63.5819 s, 53.1 MB/s

I should add that even without your patches the progress doesn't quite
correspond to the weight ratio:
...

but still there is noticeable difference to cgroups with different weights.

OTOH blk-throttle combines well with your patches: Limiting one cgroup to
5 M/s results in numbers like:

3883+2 records in
3883+2 records out
4072091648 bytes (4.1 GB) copied, 36.6713 s, 111 MB/s
413+0 records in
413+0 records out
433061888 bytes (433 MB) copied, 36.8939 s, 11.7 MB/s

which is fine and comparable with unpatched kernel. Higher throughput
number is because we do buffered writes and dd reports what it wrote into
page cache. And there is no wonder blk-throttle combines fine - it
throttles bios which happens before we reach writeback throttling
mechanism.

So I believe this demonstrates that your writeback throttling just doesn't
work well with selective scheduling policy that happens below it because it
can essentially lead to IO priority inversion issues...

Honza
-- 
Jan Kara 
SUSE Labs, CR



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jens Axboe

On 05/03/2016 09:22 AM, Jan Kara wrote:

On Tue 03-05-16 08:23:27, Jens Axboe wrote:

On 05/03/2016 03:34 AM, Jan Kara wrote:

On Thu 28-04-16 12:53:50, Jens Axboe wrote:

2) As far as I can see in patch 8/8, you have plugged the throttling above
the IO scheduler. When there are e.g. multiple cgroups with different IO
limits operating, this throttling can lead to strange results (like a
cgroup with low limit using up all available background "slots" and thus
effectively stopping background writeback for other cgroups)? So won't
it make more sense to plug this below the IO scheduler? Now I understand
there may be other problems with this but I think we should put more
thought to that and provide some justification in changelogs.


One complexity is that we have to do this early for blk-mq, since once you
get a request, you're already sitting on the hw tag. CoDel should actually
work fine at each hop, so hopefully this will as well.


OK, I see. But then this suggests that any IO scheduling and / or
cgroup-related throttling should happen before we get a request for blk-mq
as well? And then we can still do writeback throttling below that layer?


Not necessarily. For IO scheduling, basically we care about two parts:

1) Are you allowed to allocate the resources to queue some IO
2) Are you allowed to dispatch


But then it seems suboptimal to waste a relatively scarce resource (which
HW tag is AFAIU) just because you happen to run from a cgroup that is
bandwidth limited and thus are not allowed to dispatch?


For some cases, you are absolutely right, and #1 is the main one. For 
your case of QD=1, that's obviously the case. For SATA, it's a bit more 
grey zone, and for others (nvme, scsi, etc), it's not really a scarce 
resource so #2 is the bigger part of it.


--
Jens Axboe



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jan Kara
On Tue 03-05-16 08:23:27, Jens Axboe wrote:
> On 05/03/2016 03:34 AM, Jan Kara wrote:
> >On Thu 28-04-16 12:53:50, Jens Axboe wrote:
> >>>2) As far as I can see in patch 8/8, you have plugged the throttling above
> >>>the IO scheduler. When there are e.g. multiple cgroups with different 
> >>> IO
> >>>limits operating, this throttling can lead to strange results (like a
> >>>cgroup with low limit using up all available background "slots" and 
> >>> thus
> >>>effectively stopping background writeback for other cgroups)? So won't
> >>>it make more sense to plug this below the IO scheduler? Now I 
> >>> understand
> >>>there may be other problems with this but I think we should put more
> >>>thought to that and provide some justification in changelogs.
> >>
> >>One complexity is that we have to do this early for blk-mq, since once you
> >>get a request, you're already sitting on the hw tag. CoDel should actually
> >>work fine at each hop, so hopefully this will as well.
> >
> >OK, I see. But then this suggests that any IO scheduling and / or
> >cgroup-related throttling should happen before we get a request for blk-mq
> >as well? And then we can still do writeback throttling below that layer?
> 
> Not necessarily. For IO scheduling, basically we care about two parts:
> 
> 1) Are you allowed to allocate the resources to queue some IO
> 2) Are you allowed to dispatch

But then it seems suboptimal to waste a relatively scarce resource (which
HW tag is AFAIU) just because you happen to run from a cgroup that is
bandwidth limited and thus are not allowed to dispatch?

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jens Axboe

On 05/03/2016 03:34 AM, Jan Kara wrote:

On Thu 28-04-16 12:53:50, Jens Axboe wrote:

2) As far as I can see in patch 8/8, you have plugged the throttling above
the IO scheduler. When there are e.g. multiple cgroups with different IO
limits operating, this throttling can lead to strange results (like a
cgroup with low limit using up all available background "slots" and thus
effectively stopping background writeback for other cgroups)? So won't
it make more sense to plug this below the IO scheduler? Now I understand
there may be other problems with this but I think we should put more
thought to that and provide some justification in changelogs.


One complexity is that we have to do this early for blk-mq, since once you
get a request, you're already sitting on the hw tag. CoDel should actually
work fine at each hop, so hopefully this will as well.


OK, I see. But then this suggests that any IO scheduling and / or
cgroup-related throttling should happen before we get a request for blk-mq
as well? And then we can still do writeback throttling below that layer?


Not necessarily. For IO scheduling, basically we care about two parts:

1) Are you allowed to allocate the resources to queue some IO
2) Are you allowed to dispatch

The latter part can still be handled independently, and the former as 
well of course, wbt just deals with throttling back #1 for buffered writes.
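
As a rough, self-contained illustration of that split (purely schematic, with
invented names, not code from the patch set): gate #1 is the allocation /
admission step that wbt throttles for buffered writeback, gate #2 is the
dispatch step where a scheduler or cgroup policy can still hold an
already-allocated request back.

#include <stdbool.h>
#include <stdio.h>

/* gate #1: may we allocate a request (and, for blk-mq, a hw tag) at all? */
static bool may_allocate(unsigned int inflight, unsigned int wb_limit)
{
	return inflight < wb_limit;
}

/* gate #2: may an already-allocated request be dispatched right now?
 * This is where an IO scheduler or cgroup bandwidth policy can still
 * hold the request back, even though it already occupies a tag.       */
static bool may_dispatch(bool policy_allows)
{
	return policy_allows;
}

int main(void)
{
	unsigned int inflight = 7, wb_normal = 8;

	if (may_allocate(inflight, wb_normal) && !may_dispatch(false))
		printf("allocated but not dispatchable: tag held while throttled\n");
	return 0;
}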



But yes, fairness is something that we have to pay attention to. Right now
the wait queue has no priority associated with it, that should probably be
improved to be able to wakeup in a more appropriate order.
Needs testing, but hopefully it works out since if you do run into
starvation, then you'll go to the back of the queue for the next attempt.


Yeah, once I hunt down that regression with the old disk, I can have a look
into how writeback throttling plays together with the blkio-controller.


Thanks!

--
Jens Axboe



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-05-03 Thread Jan Kara
On Thu 28-04-16 12:53:50, Jens Axboe wrote:
> >2) As far as I can see in patch 8/8, you have plugged the throttling above
> >the IO scheduler. When there are e.g. multiple cgroups with different IO
> >limits operating, this throttling can lead to strange results (like a
> >cgroup with low limit using up all available background "slots" and thus
> >effectively stopping background writeback for other cgroups)? So won't
> >it make more sense to plug this below the IO scheduler? Now I understand
> >there may be other problems with this but I think we should put more
> >thought to that and provide some justification in changelogs.
> 
> One complexity is that we have to do this early for blk-mq, since once you
> get a request, you're already sitting on the hw tag. CoDel should actually
> work fine at each hop, so hopefully this will as well.

OK, I see. But then this suggests that any IO scheduling and / or
cgroup-related throttling should happen before we get a request for blk-mq
as well? And then we can still do writeback throttling below that layer?

> But yes, fairness is something that we have to pay attention to. Right now
> the wait queue has no priority associated with it, that should probably be
> improved to be able to wakeup in a more appropriate order.
> Needs testing, but hopefully it works out since if you do run into
> starvation, then you'll go to the back of the queue for the next attempt.

Yeah, once I hunt down that regression with the old disk, I can have a look
into how writeback throttling plays together with the blkio-controller.

> >>+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
> >>+{
> >>+   u64 thislat;
> >>+
> >>+   /*
> >>+* If our stored sync issue exceeds the window size, or it
> >>+* exceeds our min target AND we haven't logged any entries,
> >>+* flag the latency as exceeded.
> >>+*/
> >>+   thislat = rwb_sync_issue_lat(rwb);
> >>+   if (thislat > rwb->cur_win_nsec ||
> >>+   (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
> >>+   trace_wbt_lat(rwb->bdi, thislat);
> >>+   return LAT_EXCEEDED;
> >>+   }
> >
> >So I'm trying to wrap my head around this. If I read the code right,
> >rwb_sync_issue_lat() returns the time that has passed since issuing a sync
> >request that is still running. We basically randomly pick which sync
> >request we track, as we always start tracking a sync request when one is
> >issued and we are not tracking any at that moment. This is to detect the
> >case when the latency of sync IO is very large compared to the measurement
> >window, so we would not get enough samples to make it valid?
> 
> Right, that's pretty close. Since wbt uses the completion latencies to make
> decisions, if an IO hasn't completed, we don't know about it. If the device
> is flooded with writes, and we then issue a read, maybe that read won't
> complete for multiple monitoring windows. During that time, we keep thinking
> everything is fine. But in reality, it's not completing because of the write
> load. So this logic attempts to track the single sync IO request case. If
> that exceeds a monitoring window of time and we saw no other sync IO in that
> window, then treat that case as if it had completed but exceeded the min
> latency. And then scale back.
> 
> We'll always treat a state sample with 1 read as valuable, but for this
> case, we don't have that sample until it completes.
> 
> Does that make more sense?

OK, makes sense. Thanks for the explanation.

Honza
-- 
Jan Kara 
SUSE Labs, CR
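
Restating the tracking logic described above as a compact sketch (simplified
stand-in types, illustrative only; the decision mirrors the quoted
__latency_exceeded() fragment):

#include <stdint.h>
#include <stdio.h>

struct blk_rq_stat_lite { unsigned int nr_samples; };	/* simplified read stats */

struct rq_wb_lite {
	uint64_t cur_win_nsec;	/* current monitoring window */
	uint64_t min_lat_nsec;	/* latency target */
	uint64_t sync_issue;	/* issue time of the tracked, still-running sync IO (0 = none) */
};

/* time the tracked sync request has been outstanding */
static uint64_t rwb_sync_issue_lat(const struct rq_wb_lite *rwb, uint64_t now_ns)
{
	return rwb->sync_issue ? now_ns - rwb->sync_issue : 0;
}

/*
 * Flag the window if the tracked sync IO has been outstanding longer than
 * the window, or longer than the target while no read completed in this
 * window at all.
 */
static int sync_latency_exceeded(const struct rq_wb_lite *rwb,
				 const struct blk_rq_stat_lite *reads,
				 uint64_t now_ns)
{
	uint64_t thislat = rwb_sync_issue_lat(rwb, now_ns);

	return thislat > rwb->cur_win_nsec ||
	       (thislat > rwb->min_lat_nsec && !reads->nr_samples);
}

int main(void)
{
	struct rq_wb_lite rwb = { 100000000ULL, 2000000ULL, 500000000ULL };
	struct blk_rq_stat_lite reads = { 0 };

	/* the read was issued at t=500ms and is still outstanding at t=650ms */
	printf("exceeded=%d\n", sync_latency_exceeded(&rwb, &reads, 650000000ULL));
	return 0;
}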


Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-04-28 Thread Jens Axboe

On 04/28/2016 12:53 PM, Jens Axboe wrote:



Probably the comment could explain more of "why we do this?" than pure
"what we do".


Agree, if you find it confusing, then it needs updating. I'll update the
comment.


http://git.kernel.dk/cgit/linux-block/commit/?h=wb-buf-throttle

This should address your review comments, I believe.

--
Jens Axboe



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-04-28 Thread Jens Axboe

On 04/28/2016 05:05 AM, Jan Kara wrote:

I have some comments below...


+struct rq_wb {
+   /*
+* Settings that govern how we throttle
+*/
+   unsigned int wb_background; /* background writeback */
+   unsigned int wb_normal; /* normal writeback */
+   unsigned int wb_max;/* max throughput writeback */
+   unsigned int scale_step;
+
+   u64 win_nsec;   /* default window size */
+   u64 cur_win_nsec;   /* current window size */
+
+   unsigned int unknown_cnt;


It would be useful to have a comment here explaining that 'unknown_cnt' is
a number of consecutive periods in which we didn't have enough data to
decide about queue scaling (at least this is what I understood from the
code).


Agree, I'll add that comment.


+
+   struct timer_list window_timer;
+
+   s64 sync_issue;
+   void *sync_cookie;


So I'm somewhat wondering: What is protecting consistency of this
structure? The limits, scale_step, cur_win_nsec, unknown_cnt are updated only
from timer so those should be safe. However sync_issue & sync_cookie are
accessed from IO submission and completion path and there we need some
protection to keep those two in sync. It seems q->queue_lock should mostly
achieve those except for blk-mq submission path calling wbt_wait() which
doesn't hold queue_lock.


Right, it's designed such that only the timer will be updating these 
values, and that part is serialized. For sync_issue and sync_cookie, the 
important part there is that we never dereference sync_cookie. That's 
why it's a void * now. So we just use it as a hint. And yes, if the IO 
happens to complete at just the time we are looking at it, we could get 
a false positive or false negative. That's going to be noise, and 
nothing we need to worry about. It's deliberate that I don't do any 
locking for that, the only reason we pass in the queue_lock is to be 
able to drop it for sleeping.



It seems you were aware of the possible races and the code handles them
mostly fine (although I wouldn't bet too much there is not some weird
corner case). However it would be good to comment on this somewhere and
explain what the rules for these two fields are.


Agree, it does warrant a good code comment. If we look at the edge 
cases, one would be:


We look at sync_issue and decide that we're now too late, at the same 
time as the sync_cookie gets cleared. For this case, we'll count it as 
an exceed and scale down. In reality we were late, so it doesn't matter. 
Even if it was the exact time, it's still prudent to scale down as we're 
going to miss soon.


A more worrying case would be two issues that happen at the same time, 
and only one gets set. Let's assume the one that doesn't get set is the 
one that ends up taking a long time to complete. We'll miss scaling down 
in this case, we'll only notice when it completes and shows up in the 
stats. Not ideal, but it's still being handled in the fashion that was 
originally intended, at completion time.
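
A condensed sketch of that hint-only tracking, pieced together from the
description above and the wbt_done() fragment quoted later in the thread
(simplified types, deliberately no locking, since the cookie is only compared
and never dereferenced):

#include <stdint.h>
#include <stdio.h>

struct wb_issue_stat_lite { uint64_t time; };	/* stand-in for struct wb_issue_stat */

struct rq_wb_lite {
	int64_t sync_issue;	/* issue time of the tracked sync request */
	void *sync_cookie;	/* opaque hint; never dereferenced */
};

/* issue path: start tracking a sync request only if none is tracked yet */
static void sync_issue_track(struct rq_wb_lite *rwb,
			     struct wb_issue_stat_lite *stat, int64_t now_ns)
{
	if (!rwb->sync_cookie) {
		rwb->sync_cookie = stat;
		rwb->sync_issue = now_ns;
	}
}

/* completion path, as in the wbt_done() fragment quoted later in the thread */
static void sync_issue_complete(struct rq_wb_lite *rwb,
				struct wb_issue_stat_lite *stat)
{
	if (rwb->sync_cookie == stat) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

int main(void)
{
	struct rq_wb_lite rwb = { 0, NULL };
	struct wb_issue_stat_lite a = { 0 }, b = { 0 };

	sync_issue_track(&rwb, &a, 1000);
	sync_issue_track(&rwb, &b, 2000);	/* ignored: already tracking a */
	sync_issue_complete(&rwb, &b);		/* no-op: the cookie is not b */
	sync_issue_complete(&rwb, &a);		/* clears the hint */
	printf("tracked=%p issue=%lld\n", rwb.sync_cookie, (long long)rwb.sync_issue);
	return 0;
}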



diff --git a/lib/wbt.c b/lib/wbt.c
new file mode 100644
index ..650da911f24f
--- /dev/null
+++ b/lib/wbt.c
@@ -0,0 +1,524 @@
+/*
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ *   scaling step and scale down queue depth by a factor of 2x. The monitoring
+ *   window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ *   look like, retain status quo.
+ * - If latencies look good, decrement scaling step.


I'm wondering about two things:

1) There is a logic somewhat in this direction in blk_queue_start_tag().
Probably it should be removed after your patches land?


You're referring to the read/write separation in the legacy tagging? Yes 
agree, we can kill that once this goes in.



2) As far as I can see in patch 8/8, you have plugged the throttling above
the IO scheduler. When there are e.g. multiple cgroups with different IO
limits operating, this throttling can lead to strange results (like a
cgroup with low limit using up all available background "slots" and thus
effectively stopping background writeback for other cgroups)? So won't
it make more sense to plug this below the IO scheduler? Now I understand
there may be other problems with this but I think we should put more
thought to that and provide some justification in changelogs.


One complexity is that we have to do this early for blk-mq, since once 
you get a request, you're already sitting on the hw tag. CoDel should 
actually work fine at each hop, so hopefully this will as well.


But yes, fairness is something that we have to pay attention to. Right 
now the 

Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-04-28 Thread Jan Kara
On Tue 26-04-16 09:55:30, Jens Axboe wrote:
> We can hook this up to the block layer, to help throttle buffered
> writes. Or NFS can tap into it, to accomplish the same.
> 
> wbt registers a few trace points that can be used to track what is
> happening in the system:
> 
> wbt_lat: 259:0: latency 2446318
> wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
>wmean=518866, wmin=15522, wmax=5330353, wsamples=57
> wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, 
> max=32
> 
> This shows a sync issue event (wbt_lat) that exceeded its time. wbt_stat
> dumps the current read/write stats for that window, and wbt_step shows a
> step down event where we now scale back writes. Each trace includes the
> device, 259:0 in this case.

I have some comments below...

> +struct rq_wb {
> + /*
> +  * Settings that govern how we throttle
> +  */
> + unsigned int wb_background; /* background writeback */
> + unsigned int wb_normal; /* normal writeback */
> + unsigned int wb_max;/* max throughput writeback */
> + unsigned int scale_step;
> +
> + u64 win_nsec;   /* default window size */
> + u64 cur_win_nsec;   /* current window size */
> +
> + unsigned int unknown_cnt;

It would be useful to have a comment here explaining that 'unknown_cnt' is
a number of consecutive periods in which we didn't have enough data to
decide about queue scaling (at least this is what I understood from the
code).

> +
> + struct timer_list window_timer;
> +
> + s64 sync_issue;
> + void *sync_cookie;

So I'm somewhat wondering: What is protecting consistency of this
structure? The limits, scale_step, cur_win_nsec, unknown_cnt are updated only
from timer so those should be safe. However sync_issue & sync_cookie are
accessed from IO submission and completion path and there we need some
protection to keep those two in sync. It seems q->queue_lock should mostly
achieve those except for blk-mq submission path calling wbt_wait() which
doesn't hold queue_lock.

It seems you were aware of the possible races and the code handles them
mostly fine (although I wouldn't bet too much there is not some weird
corner case). However it would be good to comment on this somewhere and
explain what the rules for these two fields are.

> +
> + unsigned int wc;
> + unsigned int queue_depth;
> +
> + unsigned long last_issue;   /* last non-throttled issue */
> + unsigned long last_comp;/* last non-throttled comp */
> + unsigned long min_lat_nsec;
> + struct backing_dev_info *bdi;
> + struct request_queue *q;
> + wait_queue_head_t wait;
> + atomic_t inflight;
> +
> + struct wb_stat_ops *stat_ops;
> + void *ops_data;
> +};
...
> diff --git a/lib/wbt.c b/lib/wbt.c
> new file mode 100644
> index ..650da911f24f
> --- /dev/null
> +++ b/lib/wbt.c
> @@ -0,0 +1,524 @@
> +/*
> + * buffered writeback throttling. loosely based on CoDel. We can't drop
> + * packets for IO scheduling, so the logic is something like this:
> + *
> + * - Monitor latencies in a defined window of time.
> + * - If the minimum latency in the above window exceeds some target, increment
> + *   scaling step and scale down queue depth by a factor of 2x. The monitoring
> + *   window is then shrunk to 100 / sqrt(scaling step + 1).
> + * - For any window where we don't have solid data on what the latencies
> + *   look like, retain status quo.
> + * - If latencies look good, decrement scaling step.

I'm wondering about two things:

1) There is a logic somewhat in this direction in blk_queue_start_tag().
   Probably it should be removed after your patches land?

2) As far as I can see in patch 8/8, you have plugged the throttling above
   the IO scheduler. When there are e.g. multiple cgroups with different IO
   limits operating, this throttling can lead to strange results (like a
   cgroup with low limit using up all available background "slots" and thus
   effectively stopping background writeback for other cgroups)? So won't
   it make more sense to plug this below the IO scheduler? Now I understand
   there may be other problems with this but I think we should put more
   thought to that and provide some justification in changelogs.

> +static void calc_wb_limits(struct rq_wb *rwb)
> +{
> + unsigned int depth;
> +
> + if (!rwb->min_lat_nsec) {
> + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
> + return;
> + }
> +
> + depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
> +
> + /*
> +  * Reduce max depth by 50%, and re-calculate normal/bg based on that
> +  */

The comment looks a bit out of place here since we don't reduce max depth
here. We just use whatever is set in scale_step...

> + rwb->wb_max = 1 + ((depth - 1) >> min(31U, 
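
The quoted code is truncated here by the archive, but the full arithmetic
appears in the fragments quoted later in the thread: wb_max = 1 + ((depth - 1)
>> min(31U, scale_step)), wb_normal = (wb_max + 1) / 2, wb_background =
(wb_max + 3) / 4. A small worked example (RWB_MAX_DEPTH assumed to be 64 here,
purely for illustration) reproduces the numbers from the quoted wbt_step trace
at step=1 (background=8, normal=16, max=32); the window estimate follows the
100 / sqrt(step + 1) rule from the lib/wbt.c comment and is only approximate
(the trace shows window=72727272 ns, about 72.7 ms, because the kernel uses
integer arithmetic):

#include <math.h>
#include <stdio.h>

#define RWB_MAX_DEPTH	64	/* assumed cap, chosen so step=1 matches the quoted trace */

static void show_limits(unsigned int queue_depth, unsigned int scale_step)
{
	unsigned int depth = queue_depth < RWB_MAX_DEPTH ? queue_depth : RWB_MAX_DEPTH;
	unsigned int shift = scale_step < 31 ? scale_step : 31;
	unsigned int wb_max, wb_normal, wb_background;

	/* same arithmetic as the quoted calc_wb_limits() */
	wb_max = 1 + ((depth - 1) >> shift);
	wb_normal = (wb_max + 1) / 2;
	wb_background = (wb_max + 3) / 4;

	printf("step=%u: max=%u normal=%u background=%u window~%.1f ms\n",
	       scale_step, wb_max, wb_normal, wb_background,
	       100.0 / sqrt(scale_step + 1.0));	/* per the lib/wbt.c comment */
}

int main(void)
{
	unsigned int step;

	for (step = 0; step <= 2; step++)
		show_limits(128, step);	/* e.g. a queue depth of 128 */
	return 0;
}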

Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-04-27 Thread xiakaixu
On 2016/4/27 23:21, Jens Axboe wrote:
> On 04/27/2016 06:06 AM, xiakaixu wrote:
>>> +void __wbt_done(struct rq_wb *rwb)
>>> +{
>>> +int inflight, limit = rwb->wb_normal;
>>> +
>>> +/*
>>> + * If the device does write back caching, drop further down
>>> + * before we wake people up.
>>> + */
>>> +if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
>>> +limit = 0;
>>> +else
>>> +limit = rwb->wb_normal;
>>> +
>>> +/*
>>> + * Don't wake anyone up if we are above the normal limit. If
>>> + * throttling got disabled (limit == 0) with waiters, ensure
>>> + * that we wake them up.
>>> + */
>>> +inflight = atomic_dec_return(&rwb->inflight);
>>> +if (limit && inflight >= limit) {
>>> +if (!rwb->wb_max)
>>> +wake_up_all(&rwb->wait);
>>> +return;
>>> +}
>>> +
>> Hi Jens,
>>
>> Just a little confused about this. The rwb->wb_max can't be 0 if the variable
>> 'limit' does not equal 0. So the if (!rwb->wb_max) branch may not make
>> sense.
> 
> You are right, it doesn't make a lot of sense. I think it suffers from code 
> shuffling. How about the attached? The important part is that we wake up 
> waiters, if wbt got disabled while we had tracked IO in flight.
>
Hi Jens,

The modified patch in the other mail looks better. Maybe there are still
some places that could be improved. You can find them in that mail.



-- 
Regards
Kaixu Xia



Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-04-27 Thread Jens Axboe

On 04/27/2016 06:06 AM, xiakaixu wrote:

+void __wbt_done(struct rq_wb *rwb)
+{
+   int inflight, limit = rwb->wb_normal;
+
+   /*
+* If the device does write back caching, drop further down
+* before we wake people up.
+*/
+   if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
+   limit = 0;
+   else
+   limit = rwb->wb_normal;
+
+   /*
+* Don't wake anyone up if we are above the normal limit. If
+* throttling got disabled (limit == 0) with waiters, ensure
+* that we wake them up.
+*/
+   inflight = atomic_dec_return(&rwb->inflight);
+   if (limit && inflight >= limit) {
+   if (!rwb->wb_max)
+   wake_up_all(&rwb->wait);
+   return;
+   }
+

Hi Jens,

Just a little confused about this. The rwb->wb_max can't be 0 if the variable
'limit' does not equal 0. So the if (!rwb->wb_max) branch may not make
sense.


You are right, it doesn't make a lot of sense. I think it suffers from 
code shuffling. How about the attached? The important part is that we 
wake up waiters, if wbt got disabled while we had tracked IO in flight.
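
For context on what "wbt got disabled" means here: the helpers quoted further
down in the thread show rwb_enabled() is simply a check of wb_normal (return
rwb && rwb->wb_normal != 0), and calc_wb_limits() zeroes all three limits when
min_lat_nsec is 0. A condensed sketch of that relationship, with simplified
field types for illustration:

#include <stdbool.h>
#include <stdio.h>

/* simplified stand-in for the relevant struct rq_wb fields */
struct rq_wb_lite {
	unsigned int wb_background;
	unsigned int wb_normal;
	unsigned int wb_max;
	unsigned long min_lat_nsec;
};

/* mirrors the quoted rwb_enabled(): throttling is on iff wb_normal != 0 */
static bool rwb_enabled(const struct rq_wb_lite *rwb)
{
	return rwb && rwb->wb_normal != 0;
}

/* mirrors the disable branch of the quoted calc_wb_limits() */
static void wbt_disable(struct rq_wb_lite *rwb)
{
	rwb->min_lat_nsec = 0;
	rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
	/* with the fix below, the next __wbt_done() sees !rwb_enabled()
	 * and wakes all waiters instead of leaving them stuck */
}

int main(void)
{
	struct rq_wb_lite rwb = { 8, 16, 32, 2000000UL };

	wbt_disable(&rwb);
	printf("enabled=%d\n", rwb_enabled(&rwb));
	return 0;
}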


--
Jens Axboe

diff --git a/lib/wbt.c b/lib/wbt.c
index 650da911f24f..a6b80c135510 100644
--- a/lib/wbt.c
+++ b/lib/wbt.c
@@ -98,18 +98,23 @@ void __wbt_done(struct rq_wb *rwb)
 	else
 		limit = rwb->wb_normal;
 
+	inflight = atomic_dec_return(&rwb->inflight);
+
 	/*
-	 * Don't wake anyone up if we are above the normal limit. If
-	 * throttling got disabled (limit == 0) with waiters, ensure
-	 * that we wake them up.
+	 * wbt got disabled with IO in flight. Wake up any potential
+	 * waiters, we don't have to do more than that.
 	 */
-	inflight = atomic_dec_return(&rwb->inflight);
-	if (limit && inflight >= limit) {
-		if (!rwb->wb_max)
-			wake_up_all(&rwb->wait);
+	if (!rwb_enabled(rwb)) {
+		wake_up_all(&rwb->wait);
 		return;
 	}
 
+	/*
+	 * Don't wake anyone up if we are above the normal limit.
+	 */
+	if (inflight >= limit)
+		return;
+
 	if (waitqueue_active(&rwb->wait)) {
 		int diff = limit - inflight;
 


Re: [PATCH 7/8] wbt: add general throttling mechanism

2016-04-27 Thread xiakaixu

> + return rwb && rwb->wb_normal != 0;
> +}
> +
> +/*
> + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
> + * false if 'v' + 1 would be bigger than 'below'.
> + */
> +static bool atomic_inc_below(atomic_t *v, int below)
> +{
> + int cur = atomic_read(v);
> +
> + for (;;) {
> + int old;
> +
> + if (cur >= below)
> + return false;
> + old = atomic_cmpxchg(v, cur, cur + 1);
> + if (old == cur)
> + break;
> + cur = old;
> + }
> +
> + return true;
> +}
> +
> +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
> +{
> + if (rwb_enabled(rwb)) {
> + const unsigned long cur = jiffies;
> +
> + if (cur != *var)
> + *var = cur;
> + }
> +}
> +
> +void __wbt_done(struct rq_wb *rwb)
> +{
> + int inflight, limit = rwb->wb_normal;
> +
> + /*
> +  * If the device does write back caching, drop further down
> +  * before we wake people up.
> +  */
> + if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
> + limit = 0;
> + else
> + limit = rwb->wb_normal;
> +
> + /*
> +  * Don't wake anyone up if we are above the normal limit. If
> +  * throttling got disabled (limit == 0) with waiters, ensure
> +  * that we wake them up.
> +  */
> + inflight = atomic_dec_return(&rwb->inflight);
> + if (limit && inflight >= limit) {
> + if (!rwb->wb_max)
> + wake_up_all(&rwb->wait);
> + return;
> + }
> +
Hi Jens,

Just a little confused about this. The rwb->wb_max can't be 0 if the variable
'limit' does not equal 0. So the if (!rwb->wb_max) branch may not make
sense.


> + if (waitqueue_active(&rwb->wait)) {
> + int diff = limit - inflight;
> +
> + if (!inflight || diff >= rwb->wb_background / 2)
> + wake_up_nr(&rwb->wait, 1);
> + }
> +}
> +
> +/*
> + * Called on completion of a request. Note that it's also called when
> + * a request is merged, when the request gets freed.
> + */
> +void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
> +{
> + if (!rwb)
> + return;
> +
> + if (!wbt_tracked(stat)) {
> + if (rwb->sync_cookie == stat) {
> + rwb->sync_issue = 0;
> + rwb->sync_cookie = NULL;
> + }
> +
> + wb_timestamp(rwb, &rwb->last_comp);
> + } else {
> + WARN_ON_ONCE(stat == rwb->sync_cookie);
> + __wbt_done(rwb);
> + wbt_clear_tracked(stat);
> + }
> +}
> +
> +static void calc_wb_limits(struct rq_wb *rwb)
> +{
> + unsigned int depth;
> +
> + if (!rwb->min_lat_nsec) {
> + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
> + return;
> + }
> +
> + depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
> +
> + /*
> +  * Reduce max depth by 50%, and re-calculate normal/bg based on that
> +  */
> + rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
> + rwb->wb_normal = (rwb->wb_max + 1) / 2;
> + rwb->wb_background = (rwb->wb_max + 3) / 4;
> +}
> +
> +static bool inline stat_sample_valid(struct blk_rq_stat *stat)
> +{
> + /*
> +  * We need at least one read sample, and a minimum of
> +  * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
> +  * that it's writes impacting us, and not just some sole read on
> +  * a device that is in a lower power state.
> +  */
> + return stat[0].nr_samples >= 1 &&
> + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
> +}
> +



[PATCH 7/8] wbt: add general throttling mechanism

2016-04-26 Thread Jens Axboe
We can hook this up to the block layer, to help throttle buffered
writes. Or NFS can tap into it, to accomplish the same.

wbt registers a few trace points that can be used to track what is
happening in the system:

wbt_lat: 259:0: latency 2446318
wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
   wmean=518866, wmin=15522, wmax=5330353, wsamples=57
wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, 
max=32

This shows a sync issue event (wbt_lat) that exceeded it's time. wbt_stat
dumps the current read/write stats for that window, and wbt_step shows a
step down event where we now scale back writes. Each trace includes the
device, 259:0 in this case.

Signed-off-by: Jens Axboe 
---
 include/linux/wbt.h|  95 
 include/trace/events/wbt.h | 122 +++
 lib/Kconfig|   3 +
 lib/Makefile   |   1 +
 lib/wbt.c  | 524 +
 5 files changed, 745 insertions(+)
 create mode 100644 include/linux/wbt.h
 create mode 100644 include/trace/events/wbt.h
 create mode 100644 lib/wbt.c

diff --git a/include/linux/wbt.h b/include/linux/wbt.h
new file mode 100644
index ..c8a12795416b
--- /dev/null
+++ b/include/linux/wbt.h
@@ -0,0 +1,95 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include 
+#include 
+#include 
+#include 
+
+#define ISSUE_STAT_MASK(1ULL << 63)
+#define ISSUE_STAT_TIME_MASK   ~ISSUE_STAT_MASK
+
+struct wb_issue_stat {
+   u64 time;
+};
+
+static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
+{
+   stat->time = (stat->time & ISSUE_STAT_MASK) |
+   (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
+}
+
+static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
+{
+   return stat->time & ISSUE_STAT_TIME_MASK;
+}
+
+static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
+{
+   stat->time |= ISSUE_STAT_MASK;
+}
+
+static inline void wbt_clear_tracked(struct wb_issue_stat *stat)
+{
+   stat->time &= ~ISSUE_STAT_MASK;
+}
+
+static inline bool wbt_tracked(struct wb_issue_stat *stat)
+{
+   return (stat->time & ISSUE_STAT_MASK) != 0;
+}
+
+struct wb_stat_ops {
+   void (*get)(void *, struct blk_rq_stat *);
+   void (*clear)(void *);
+};
+
+struct rq_wb {
+   /*
+* Settings that govern how we throttle
+*/
+   unsigned int wb_background; /* background writeback */
+   unsigned int wb_normal; /* normal writeback */
+   unsigned int wb_max;/* max throughput writeback */
+   unsigned int scale_step;
+
+   u64 win_nsec;   /* default window size */
+   u64 cur_win_nsec;   /* current window size */
+
+   unsigned int unknown_cnt;
+
+   struct timer_list window_timer;
+
+   s64 sync_issue;
+   void *sync_cookie;
+
+   unsigned int wc;
+   unsigned int queue_depth;
+
+   unsigned long last_issue;   /* last non-throttled issue */
+   unsigned long last_comp;/* last non-throttled comp */
+   unsigned long min_lat_nsec;
+   struct backing_dev_info *bdi;
+   struct request_queue *q;
+   wait_queue_head_t wait;
+   atomic_t inflight;
+
+   struct wb_stat_ops *stat_ops;
+   void *ops_data;
+};
+
+struct backing_dev_info;
+
+void __wbt_done(struct rq_wb *);
+void wbt_done(struct rq_wb *, struct wb_issue_stat *);
+bool wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void 
*);
+void wbt_exit(struct rq_wb *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+#endif
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
new file mode 100644
index ..a4b8b2e57bb1
--- /dev/null
+++ b/include/trace/events/wbt.h
@@ -0,0 +1,122 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM wbt
+
+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WBT_H
+
+#include 
+#include 
+
+/**
+ * wbt_stat - trace stats for blk_wb
+ * @stat: array of read/write stats
+ */
+TRACE_EVENT(wbt_stat,
+
+   TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
+
+   TP_ARGS(bdi, stat),
+
+   TP_STRUCT__entry(
+   __array(char, name, 32)
+   __field(s64, rmean)
+   __field(u64, rmin)
+   __field(u64, rmax)
+   __field(s64, rnr_samples)
+   __field(s64, rtime)
+   __field(s64, wmean)
+   __field(u64, wmin)
+   __field(u64, wmax)
+   __field(s64, wnr_samples)
+   __field(s64,