[PATCH 8/8] writeback: throttle buffered writeback

2016-09-07 Thread Jens Axboe
Test patch that throttles buffered writeback to make it a lot
smoother, with far less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at a time
means that it potentially has a heavy impact on foreground workloads,
which isn't ideal. We can't easily limit the sizes of the writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

The algorithm for when to throttle takes its inspiration from the
CoDel network scheduling algorithm. Like CoDel, blk-wb monitors
the minimum latencies of requests over a window of time. In that
window of time, if the minimum latency of any request exceeds a
given target, then a scale count is incremented and the queue depth
is shrunk. The next monitoring window is shrunk accordingly. Unlike
CoDel, if we hit a window that exhibits good behavior, then we
simply increment the scale count and re-calculate the limits for that
scale value. This prevents us from oscillating between a
close-to-ideal value and max all the time, instead remaining in the
windows where we get good behavior.

Unlike CoDel, blk-wb allows the scale count to go negative. This
happens if we primarily have writes going on. Unlike positive
scale counts, this doesn't change the size of the monitoring window.
When the heavy writers finish, blk-wb quickly snaps back to its
stable state of a zero scale count.
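
As an illustration of the scaling described above, here is a minimal
stand-alone C sketch. It is not the kernel code; the names, constants,
and the exact step-up/step-down policy are invented for the example:

/* Toy model of the CoDel-inspired scaling described above; illustrative only. */
#include <stdio.h>
#include <stdbool.h>

#define BASE_WINDOW_USEC	100000	/* hypothetical monitoring window */
#define BASE_DEPTH		32	/* hypothetical writeback queue depth */
#define TARGET_LAT_USEC		2000	/* hypothetical read latency target */

struct wb_state {
	int scale_step;			/* > 0 throttles harder, < 0 opens up */
	unsigned int depth;		/* currently allowed writeback depth */
	unsigned int window_usec;	/* current monitoring window */
};

/* Re-calculate the limits for the current scale step. */
static void wb_calc_limits(struct wb_state *wb)
{
	if (wb->scale_step > 0) {
		/* positive steps shrink both the depth and the window */
		wb->depth = BASE_DEPTH >> wb->scale_step;
		wb->window_usec = BASE_WINDOW_USEC >> wb->scale_step;
	} else {
		/* negative steps allow more depth, but keep the window size */
		wb->depth = BASE_DEPTH << -wb->scale_step;
		wb->window_usec = BASE_WINDOW_USEC;
	}
	if (wb->depth == 0)
		wb->depth = 1;
}

/*
 * Called once per monitoring window with the minimum read latency seen in
 * that window, and whether the window consisted purely of writes.
 */
static void wb_window_done(struct wb_state *wb, unsigned int min_lat_usec,
			   bool writes_only)
{
	if (min_lat_usec > TARGET_LAT_USEC)
		wb->scale_step++;	/* missed the target: throttle harder */
	else if (writes_only)
		wb->scale_step--;	/* on target and write-heavy: may go negative */
	wb_calc_limits(wb);
}

int main(void)
{
	struct wb_state wb = { .scale_step = 0 };
	unsigned int min_lats[] = { 1500, 4000, 5000, 1800, 1700 };
	unsigned int i;

	wb_calc_limits(&wb);
	for (i = 0; i < sizeof(min_lats) / sizeof(min_lats[0]); i++) {
		wb_window_done(&wb, min_lats[i], false);
		printf("window %u: min_lat=%uus step=%d depth=%u window=%uus\n",
		       i, min_lats[i], wb.scale_step, wb.depth, wb.window_usec);
	}
	return 0;
}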

The patch registers two sysfs entries. The first one, 'wb_window_usec',
defines the window of monitoring. The second one, 'wb_lat_usec',
sets the latency target for the window. It defaults to 2 msec for
non-rotational storage, and 75 msec for rotational storage. Setting
this value to '0' disables blk-wb. Generally, a user would not have
to touch these settings.
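
For completeness, a small user-space C sketch of how the two knobs could be
read and tuned. The attribute names are the ones added by this patch; the
/sys/block/<dev>/queue/ location and the 'sda' device name are assumptions
made for the example:

/* Illustrative only: read and update the wb_* queue attributes from user space. */
#include <stdio.h>

static long read_queue_attr(const char *dev, const char *attr)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", dev, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

static int write_queue_attr(const char *dev, const char *attr, long val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", dev, attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	const char *dev = "sda";	/* example device, needs root to write */

	printf("wb_lat_usec    = %ld\n", read_queue_attr(dev, "wb_lat_usec"));
	printf("wb_window_usec = %ld\n", read_queue_attr(dev, "wb_window_usec"));

	/* e.g. relax the read latency target to 10 msec; writing 0 disables blk-wb */
	if (write_queue_attr(dev, "wb_lat_usec", 10000) != 0)
		perror("wb_lat_usec");
	return 0;
}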

We don't enable WBT on devices that are managed by CFQ and have
a non-root block cgroup attached. If we have a proportional share setup
on this particular disk, then the wbt throttling would interfere with
it. We don't have a strong need for wbt in that case, since we
rely on CFQ to do that for us.

Signed-off-by: Jens Axboe 
---
 Documentation/block/queue-sysfs.txt |  13 
 block/Kconfig   |   1 +
 block/blk-core.c|  20 +-
 block/blk-mq.c  |  30 -
 block/blk-settings.c|   3 +
 block/blk-stat.c|   5 +-
 block/blk-sysfs.c   | 125 
 block/cfq-iosched.c |  12 
 include/linux/blkdev.h  |   6 +-
 9 files changed, 206 insertions(+), 9 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt 
b/Documentation/block/queue-sysfs.txt
index 2a3904030dea..2847219ebd8c 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a 
single write-same
 command.  A value of '0' means write-same is not supported by this
 device.
 
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
+wb_window_usec (RW)
+-------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wb_lat_usec.
+
 
 Jens Axboe , February 2009
diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..6da79e670709 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
 menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+   select WBT
help
 Provide block layer support for the kernel.
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4075cbeb720e..4f4ce050290c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,
 
 fail:
blk_free_flush_queue(q->fq);
+   wbt_exit(q->rq_wb);
+   q->rq_wb = NULL;
return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct 
request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+   wbt_requeue(q->rq_wb, >wb_stat);
 
if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
 
+   wbt_done(q->rq_wb, 

[PATCH 8/8] writeback: throttle buffered writeback

2016-08-31 Thread Jens Axboe
Test patch that throttles buffered writeback to make it a lot
smoother, with far less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at a time
means that it potentially has a heavy impact on foreground workloads,
which isn't ideal. We can't easily limit the sizes of the writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

The algorithm for when to throttle takes its inspiration from the
CoDel network scheduling algorithm. Like CoDel, blk-wb monitors
the minimum latencies of requests over a window of time. In that
window of time, if the minimum latency of any request exceeds a
given target, then a scale count is incremented and the queue depth
is shrunk. The next monitoring window is shrunk accordingly. Unlike
CoDel, if we hit a window that exhibits good behavior, then we
simply increment the scale count and re-calculate the limits for that
scale value. This prevents us from oscillating between a
close-to-ideal value and max all the time, instead remaining in the
windows where we get good behavior.

The patch registers two sysfs entries. The first one, 'wb_window_usec',
defines the window of monitoring. The second one, 'wb_lat_usec',
sets the latency target for the window. It defaults to 2 msec for
non-rotational storage, and 75 msec for rotational storage. Setting
this value to '0' disables blk-wb. Generally, a user would not have
to touch these settings.

We don't enable WBT on devices that are managed by CFQ and have
a non-root block cgroup attached. If we have a proportional share setup
on this particular disk, then the wbt throttling would interfere with
it. We don't have a strong need for wbt in that case, since we
rely on CFQ to do that for us.

Signed-off-by: Jens Axboe 
---
 Documentation/block/queue-sysfs.txt |  13 
 block/Kconfig   |   1 +
 block/blk-core.c|  20 +-
 block/blk-mq.c  |  30 -
 block/blk-settings.c|   3 +
 block/blk-stat.c|   5 +-
 block/blk-sysfs.c   | 119 
 block/cfq-iosched.c |  12 
 include/linux/blkdev.h  |   6 +-
 9 files changed, 200 insertions(+), 9 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt 
b/Documentation/block/queue-sysfs.txt
index 2a3904030dea..2847219ebd8c 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a 
single write-same
 command.  A value of '0' means write-same is not supported by this
 device.
 
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
+wb_window_usec (RW)
+-------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wb_lat_usec.
+
 
 Jens Axboe , February 2009
diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..6da79e670709 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
 menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+   select WBT
help
 Provide block layer support for the kernel.
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4075cbeb720e..4f4ce050290c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,
 
 fail:
blk_free_flush_queue(q->fq);
+   wbt_exit(q->rq_wb);
+   q->rq_wb = NULL;
return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct 
request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+   wbt_requeue(q->rq_wb, >wb_stat);
 
if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
 
+   wbt_done(q->rq_wb, >wb_stat);
+
/*
 * Request may not have originated from ll_rw_blk. if not,
 * it didn't come out of our reserved rq pools
@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
int el_ret, rw_flags = 0, where = 

[PATCH 8/8] writeback: throttle buffered writeback

2016-04-26 Thread Jens Axboe
Test patch that throttles buffered writeback to make it a lot
smoother, with far less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at a time
means that it potentially has a heavy impact on foreground workloads,
which isn't ideal. We can't easily limit the sizes of the writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

The algorithm for when to throttle takes its inspiration from the
CoDel network scheduling algorithm. Like CoDel, blk-wb monitors
the minimum latencies of requests over a window of time. In that
window of time, if the minimum latency of any request exceeds a
given target, then a scale count is incremented and the queue depth
is shrunk. The next monitoring window is shrunk accordingly. Unlike
CoDel, if we hit a window that exhibits good behavior, then we
simply increment the scale count and re-calculate the limits for that
scale value. This prevents us from oscillating between a
close-to-ideal value and max all the time, instead remaining in the
windows where we get good behavior.

The patch registers two sysfs entries. The first one, 'wb_window_usec',
defines the window of monitoring. The second one, 'wb_lat_usec',
sets the latency target for the window. It defaults to 2 msec for
non-rotational storage, and 75 msec for rotational storage. Setting
this value to '0' disables blk-wb. Generally, a user would not have
to touch these settings.

Signed-off-by: Jens Axboe 
---
 Documentation/block/queue-sysfs.txt |  13 
 block/Kconfig   |   1 +
 block/blk-core.c|  21 ++-
 block/blk-mq.c  |  32 +-
 block/blk-settings.c|   3 +
 block/blk-stat.c|   5 +-
 block/blk-sysfs.c   | 119 
 include/linux/blkdev.h  |   6 +-
 8 files changed, 191 insertions(+), 9 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt 
b/Documentation/block/queue-sysfs.txt
index dce25d848d92..9bc990abef4d 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -151,5 +151,18 @@ device state. This means that it might not be safe to 
toggle the
 setting from "write back" to "write through", since that will also
 eliminate cache flushes issued by the kernel.
 
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
+wb_window_usec (RW)
+-------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wb_lat_usec.
+
 
 Jens Axboe , February 2009
diff --git a/block/Kconfig b/block/Kconfig
index 0363cd731320..d4c2ff4b9b2c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
 menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+   select WBT
help
 Provide block layer support for the kernel.
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 40b57bf4852c..c166d46a09d1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -880,6 +881,8 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,
 
 fail:
blk_free_flush_queue(q->fq);
+   wbt_exit(q->rq_wb);
+   q->rq_wb = NULL;
return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1395,6 +1398,7 @@ void blk_requeue_request(struct request_queue *q, struct 
request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+   wbt_requeue(q->rq_wb, >wb_stat);
 
if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1485,6 +1489,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
 
+   wbt_done(q->rq_wb, >wb_stat);
+
/*
 * Request may not have originated from ll_rw_blk. if not,
 * it didn't come out of our reserved rq pools
@@ -1714,6 +1720,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+   bool wb_acct;
 
/*
 * low level driver can indicate that it wants pages above a
@@ -1766,6 +1773,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
}
 
 get_rq:
+   wb_acct = wbt_wait(q->rq_wb, 

Re: [PATCH 8/8] writeback: throttle buffered writeback

2016-04-25 Thread Jens Axboe

On 04/25/2016 05:41 AM, xiakaixu wrote:

On 2016/4/24 5:37, Jens Axboe wrote:

On 04/23/2016 02:21 AM, xiakaixu wrote:

diff --git a/block/blk-core.c b/block/blk-core.c
index 40b57bf4852c..d941f69dfb4b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@

   #include "blk.h"
   #include "blk-mq.h"
+#include "blk-wb.h"

   EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
   EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,

   fail:
   blk_free_flush_queue(q->fq);
+blk_wb_exit(q);
   return NULL;
   }
   EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, struct 
request *rq)
   blk_delete_timer(rq);
   blk_clear_rq_complete(rq);
   trace_block_rq_requeue(q, rq);
+blk_wb_requeue(q->rq_wb, rq);

   if (rq->cmd_flags & REQ_QUEUED)
   blk_queue_end_tag(q, rq);
@@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
   /* this is a bio leak */
   WARN_ON(req->bio != NULL);

+blk_wb_done(q->rq_wb, req);
+
   /*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
@@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
   int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
   struct request *req;
   unsigned int request_count = 0;
+bool wb_acct;

   /*
* low level driver can indicate that it wants pages above a
@@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
   }

   get_rq:
+wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock);
+
   /*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
@@ -1781,11 +1789,16 @@ get_rq:
*/
   req = get_request(q, rw_flags, bio, GFP_NOIO);
   if (IS_ERR(req)) {
+if (wb_acct)
+__blk_wb_done(q->rq_wb);
   bio->bi_error = PTR_ERR(req);
   bio_endio(bio);
   goto out_unlock;
   }

+if (wb_acct)
+req->cmd_flags |= REQ_BUF_INFLIGHT;
+
   /*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
@@ -2515,6 +2528,7 @@ void blk_start_request(struct request *req)
   blk_dequeue_request(req);

   req->issue_time = ktime_to_ns(ktime_get());
+blk_wb_issue(req->q->rq_wb, req);

   /*
* We are now handing the request to the hardware, initialize
@@ -2751,6 +2765,7 @@ void blk_finish_request(struct request *req, int error)
   blk_unprep_request(req);

   blk_account_io_done(req);
+blk_wb_done(req->q->rq_wb, req);


Hi Jens,

Seems the function blk_wb_done() will be executed twice even if the end_io
callback is set.
Maybe the same thing would happen in blk-mq.c.


Yeah, that was a mistake; the current version has it fixed. It was inadvertently 
added when I discovered that the flush request didn't work properly. Now it just 
duplicates the call inside the check for whether it has an ->end_io() defined, since 
we don't use the normal path for that.


Hi Jens,

I have checked the wb-buf-throttle branch in your block git repo. I am not sure 
it is the complete version.
It seems the problem is only fixed in blk-mq.c. The function blk_wb_done() would 
still be executed twice in blk-core.c
(in the functions blk_finish_request() and __blk_put_request()).
Maybe we can add a flag to mark whether blk_wb_done() has been done or not.


Good catch, looks like I only patched up the mq bits. It's still not 
perfect, since we could potentially double-account a request that has a 
private end_io(), if it was allocated through the normal block rq 
allocator. It'll skew the unrelated-io timestamp a bit, but it's not a 
big deal. The count for inflight will be consistent, which is the 
important part.


We currently have just 1 bit to tell if the request is tracked or not, 
so we don't know if it was tracked but already seen.


I'll fix up the blk-core part to be identical to the blk-mq fix.
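
For reference, the kind of guard being discussed above can be sketched in
plain C as below. The flag names and helpers are invented for the example
(they are not the actual patch), and a real fix would need the test-and-set
to be atomic:

/* Illustrative sketch of the "tracked but already seen" idea discussed above. */
#include <stdio.h>

#define RQ_WB_TRACKED	(1U << 0)	/* request was accounted when issued */
#define RQ_WB_DONE	(1U << 1)	/* completion already accounted */

struct toy_request {
	unsigned int wb_flags;
};

struct toy_wb {
	int inflight;
};

/* Safe to call from both the end_io path and the final put path. */
static void toy_wb_done(struct toy_wb *wb, struct toy_request *rq)
{
	if (!(rq->wb_flags & RQ_WB_TRACKED))
		return;			/* never accounted, nothing to undo */
	if (rq->wb_flags & RQ_WB_DONE)
		return;			/* already seen: avoid double accounting */
	rq->wb_flags |= RQ_WB_DONE;	/* real code would need this to be atomic */
	wb->inflight--;
}

int main(void)
{
	struct toy_wb wb = { .inflight = 1 };
	struct toy_request rq = { .wb_flags = RQ_WB_TRACKED };

	toy_wb_done(&wb, &rq);	/* e.g. from the finish/end_io path */
	toy_wb_done(&wb, &rq);	/* e.g. again from the final put path */
	printf("inflight = %d\n", wb.inflight);	/* stays consistent at 0 */
	return 0;
}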

--
Jens Axboe



Re: [PATCH 8/8] writeback: throttle buffered writeback

2016-04-25 Thread xiakaixu
On 2016/4/24 5:37, Jens Axboe wrote:
> On 04/23/2016 02:21 AM, xiakaixu wrote:
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 40b57bf4852c..d941f69dfb4b 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -39,6 +39,7 @@
>>>
>>>   #include "blk.h"
>>>   #include "blk-mq.h"
>>> +#include "blk-wb.h"
>>>
>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
>>> @@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, 
>>> request_fn_proc *rfn,
>>>
>>>   fail:
>>>   blk_free_flush_queue(q->fq);
>>> +blk_wb_exit(q);
>>>   return NULL;
>>>   }
>>>   EXPORT_SYMBOL(blk_init_allocated_queue);
>>> @@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, 
>>> struct request *rq)
>>>   blk_delete_timer(rq);
>>>   blk_clear_rq_complete(rq);
>>>   trace_block_rq_requeue(q, rq);
>>> +blk_wb_requeue(q->rq_wb, rq);
>>>
>>>   if (rq->cmd_flags & REQ_QUEUED)
>>>   blk_queue_end_tag(q, rq);
>>> @@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, 
>>> struct request *req)
>>>   /* this is a bio leak */
>>>   WARN_ON(req->bio != NULL);
>>>
>>> +blk_wb_done(q->rq_wb, req);
>>> +
>>>   /*
>>>* Request may not have originated from ll_rw_blk. if not,
>>>* it didn't come out of our reserved rq pools
>>> @@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue 
>>> *q, struct bio *bio)
>>>   int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
>>>   struct request *req;
>>>   unsigned int request_count = 0;
>>> +bool wb_acct;
>>>
>>>   /*
>>>* low level driver can indicate that it wants pages above a
>>> @@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue 
>>> *q, struct bio *bio)
>>>   }
>>>
>>>   get_rq:
>>> +wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock);
>>> +
>>>   /*
>>>* This sync check and mask will be re-done in 
>>> init_request_from_bio(),
>>>* but we need to set it earlier to expose the sync flag to the
>>> @@ -1781,11 +1789,16 @@ get_rq:
>>>*/
>>>   req = get_request(q, rw_flags, bio, GFP_NOIO);
>>>   if (IS_ERR(req)) {
>>> +if (wb_acct)
>>> +__blk_wb_done(q->rq_wb);
>>>   bio->bi_error = PTR_ERR(req);
>>>   bio_endio(bio);
>>>   goto out_unlock;
>>>   }
>>>
>>> +if (wb_acct)
>>> +req->cmd_flags |= REQ_BUF_INFLIGHT;
>>> +
>>>   /*
>>>* After dropping the lock and possibly sleeping here, our request
>>>* may now be mergeable after it had proven unmergeable (above).
>>> @@ -2515,6 +2528,7 @@ void blk_start_request(struct request *req)
>>>   blk_dequeue_request(req);
>>>
>>>   req->issue_time = ktime_to_ns(ktime_get());
>>> +blk_wb_issue(req->q->rq_wb, req);
>>>
>>>   /*
>>>* We are now handing the request to the hardware, initialize
>>> @@ -2751,6 +2765,7 @@ void blk_finish_request(struct request *req, int 
>>> error)
>>>   blk_unprep_request(req);
>>>
>>>   blk_account_io_done(req);
>>> +blk_wb_done(req->q->rq_wb, req);
>>
>> Hi Jens,
>>
>> Seems the function blk_wb_done() will be executed twice even if the end_io
>> callback is set.
>> Maybe the same thing would happen in blk-mq.c.
> 
> Yeah, that was a mistake; the current version has it fixed. It was 
> inadvertently added when I discovered that the flush request didn't work 
> properly. Now it just duplicates the call inside the check for whether it has an 
> ->end_io() defined, since we don't use the normal path for that.
>
Hi Jens,

I have checked the wb-buf-throttle branch in your block git repo. I am not sure 
it is the complete version.
It seems the problem is only fixed in blk-mq.c. The function blk_wb_done() would 
still be executed twice in blk-core.c
(in the functions blk_finish_request() and __blk_put_request()).
Maybe we can add a flag to mark whether blk_wb_done() has been done or not.



-- 
Regards
Kaixu Xia




Re: [PATCH 8/8] writeback: throttle buffered writeback

2016-04-23 Thread Jens Axboe

On 04/23/2016 02:21 AM, xiakaixu wrote:

diff --git a/block/blk-core.c b/block/blk-core.c
index 40b57bf4852c..d941f69dfb4b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@

  #include "blk.h"
  #include "blk-mq.h"
+#include "blk-wb.h"

  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,

  fail:
blk_free_flush_queue(q->fq);
+   blk_wb_exit(q);
return NULL;
  }
  EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, struct 
request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+   blk_wb_requeue(q->rq_wb, rq);

if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);

+   blk_wb_done(q->rq_wb, req);
+
/*
 * Request may not have originated from ll_rw_blk. if not,
 * it didn't come out of our reserved rq pools
@@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+   bool wb_acct;

/*
 * low level driver can indicate that it wants pages above a
@@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
}

  get_rq:
+   wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock);
+
/*
 * This sync check and mask will be re-done in init_request_from_bio(),
 * but we need to set it earlier to expose the sync flag to the
@@ -1781,11 +1789,16 @@ get_rq:
 */
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+   if (wb_acct)
+   __blk_wb_done(q->rq_wb);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}

+   if (wb_acct)
+   req->cmd_flags |= REQ_BUF_INFLIGHT;
+
/*
 * After dropping the lock and possibly sleeping here, our request
 * may now be mergeable after it had proven unmergeable (above).
@@ -2515,6 +2528,7 @@ void blk_start_request(struct request *req)
blk_dequeue_request(req);

req->issue_time = ktime_to_ns(ktime_get());
+   blk_wb_issue(req->q->rq_wb, req);

/*
 * We are now handing the request to the hardware, initialize
@@ -2751,6 +2765,7 @@ void blk_finish_request(struct request *req, int error)
blk_unprep_request(req);

blk_account_io_done(req);
+   blk_wb_done(req->q->rq_wb, req);


Hi Jens,

Seems the function blk_wb_done() will be executed twice even if the end_io
callback is set.
Maybe the same thing would happen in blk-mq.c.


Yeah, that was a mistake; the current version has it fixed. It was 
inadvertently added when I discovered that the flush request didn't work 
properly. Now it just duplicates the call inside the check for whether it has 
an ->end_io() defined, since we don't use the normal path for that.


--
Jens Axboe




Re: [PATCH 8/8] writeback: throttle buffered writeback

2016-04-23 Thread xiakaixu
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 40b57bf4852c..d941f69dfb4b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -39,6 +39,7 @@
>  
>  #include "blk.h"
>  #include "blk-mq.h"
> +#include "blk-wb.h"
>  
>  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
> @@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, 
> request_fn_proc *rfn,
>  
>  fail:
>   blk_free_flush_queue(q->fq);
> + blk_wb_exit(q);
>   return NULL;
>  }
>  EXPORT_SYMBOL(blk_init_allocated_queue);
> @@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, 
> struct request *rq)
>   blk_delete_timer(rq);
>   blk_clear_rq_complete(rq);
>   trace_block_rq_requeue(q, rq);
> + blk_wb_requeue(q->rq_wb, rq);
>  
>   if (rq->cmd_flags & REQ_QUEUED)
>   blk_queue_end_tag(q, rq);
> @@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct 
> request *req)
>   /* this is a bio leak */
>   WARN_ON(req->bio != NULL);
>  
> + blk_wb_done(q->rq_wb, req);
> +
>   /*
>* Request may not have originated from ll_rw_blk. if not,
>* it didn't come out of our reserved rq pools
> @@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
> struct bio *bio)
>   int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
>   struct request *req;
>   unsigned int request_count = 0;
> + bool wb_acct;
>  
>   /*
>* low level driver can indicate that it wants pages above a
> @@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
> struct bio *bio)
>   }
>  
>  get_rq:
> + wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock);
> +
>   /*
>* This sync check and mask will be re-done in init_request_from_bio(),
>* but we need to set it earlier to expose the sync flag to the
> @@ -1781,11 +1789,16 @@ get_rq:
>*/
>   req = get_request(q, rw_flags, bio, GFP_NOIO);
>   if (IS_ERR(req)) {
> + if (wb_acct)
> + __blk_wb_done(q->rq_wb);
>   bio->bi_error = PTR_ERR(req);
>   bio_endio(bio);
>   goto out_unlock;
>   }
>  
> + if (wb_acct)
> + req->cmd_flags |= REQ_BUF_INFLIGHT;
> +
>   /*
>* After dropping the lock and possibly sleeping here, our request
>* may now be mergeable after it had proven unmergeable (above).
> @@ -2515,6 +2528,7 @@ void blk_start_request(struct request *req)
>   blk_dequeue_request(req);
>  
>   req->issue_time = ktime_to_ns(ktime_get());
> + blk_wb_issue(req->q->rq_wb, req);
>  
>   /*
>* We are now handing the request to the hardware, initialize
> @@ -2751,6 +2765,7 @@ void blk_finish_request(struct request *req, int error)
>   blk_unprep_request(req);
>  
>   blk_account_io_done(req);
> + blk_wb_done(req->q->rq_wb, req);

Hi Jens,

Seems the function blk_wb_done() will be executed twice even if the end_io
callback is set.
Maybe the same thing would happen in blk-mq.c.

>  
>   if (req->end_io)
>   req->end_io(req, error);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 71b4a13fbf94..c0c5207fe7fd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -30,6 +30,7 @@
>  #include "blk-mq.h"
>  #include "blk-mq-tag.h"
>  #include "blk-stat.h"
> +#include "blk-wb.h"
>  
>  static DEFINE_MUTEX(all_q_mutex);
>  static LIST_HEAD(all_q_list);
> @@ -275,6 +276,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx 
> *hctx,
>  
>   if (rq->cmd_flags & REQ_MQ_INFLIGHT)
>   atomic_dec(>nr_active);
> +
> + blk_wb_done(q->rq_wb, rq);
> +
>   rq->cmd_flags = 0;
>  
>   clear_bit(REQ_ATOM_STARTED, >atomic_flags);
> @@ -305,6 +309,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
>  inline void __blk_mq_end_request(struct request *rq, int error)
>  {
>   blk_account_io_done(rq);
> + blk_wb_done(rq->q->rq_wb, rq);
>  
>   if (rq->end_io) {
>   rq->end_io(rq, error);
> @@ -414,6 +419,7 @@ void blk_mq_start_request(struct request *rq)
>   rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
>  
>   rq->issue_time = ktime_to_ns(ktime_get());
> + blk_wb_issue(q->rq_wb, rq);
>  
>   blk_add_timer(rq);
>  
> @@ -450,6 +456,7 @@ static void __blk_mq_requeue_request(struct request *rq)
>   struct request_queue *q = rq->q;
>  
>   trace_block_rq_requeue(q, rq);
> + blk_wb_requeue(q->rq_wb, rq);
>  
>   if (test_and_clear_bit(REQ_ATOM_STARTED, >atomic_flags)) {
>   if (q->dma_drain_size && blk_rq_bytes(rq))
> @@ -1265,6 +1272,7 @@ static blk_qc_t blk_mq_make_request(struct 
> request_queue *q, struct bio *bio)
>   struct blk_plug *plug;
>   struct request *same_queue_rq = NULL;
>   blk_qc_t cookie;
> + bool wb_acct;
>  
>   blk_queue_bounce(q, );
>  
> @@ -1282,9 +1290,17 @@ static 

Re: [PATCH 8/8] writeback: throttle buffered writeback

2016-04-23 Thread xiakaixu
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 40b57bf4852c..d941f69dfb4b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -39,6 +39,7 @@
>  
>  #include "blk.h"
>  #include "blk-mq.h"
> +#include "blk-wb.h"
>  
>  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
> @@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, 
> request_fn_proc *rfn,
>  
>  fail:
>   blk_free_flush_queue(q->fq);
> + blk_wb_exit(q);
>   return NULL;
>  }
>  EXPORT_SYMBOL(blk_init_allocated_queue);
> @@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, 
> struct request *rq)
>   blk_delete_timer(rq);
>   blk_clear_rq_complete(rq);
>   trace_block_rq_requeue(q, rq);
> + blk_wb_requeue(q->rq_wb, rq);
>  
>   if (rq->cmd_flags & REQ_QUEUED)
>   blk_queue_end_tag(q, rq);
> @@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct 
> request *req)
>   /* this is a bio leak */
>   WARN_ON(req->bio != NULL);
>  
> + blk_wb_done(q->rq_wb, req);
> +
>   /*
>* Request may not have originated from ll_rw_blk. if not,
>* it didn't come out of our reserved rq pools
> @@ -1714,6 +1719,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
> struct bio *bio)
>   int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
>   struct request *req;
>   unsigned int request_count = 0;
> + bool wb_acct;
>  
>   /*
>* low level driver can indicate that it wants pages above a
> @@ -1766,6 +1772,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
> struct bio *bio)
>   }
>  
>  get_rq:
> + wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock);
> +
>   /*
>* This sync check and mask will be re-done in init_request_from_bio(),
>* but we need to set it earlier to expose the sync flag to the
> @@ -1781,11 +1789,16 @@ get_rq:
>*/
>   req = get_request(q, rw_flags, bio, GFP_NOIO);
>   if (IS_ERR(req)) {
> + if (wb_acct)
> + __blk_wb_done(q->rq_wb);
>   bio->bi_error = PTR_ERR(req);
>   bio_endio(bio);
>   goto out_unlock;
>   }
>  
> + if (wb_acct)
> + req->cmd_flags |= REQ_BUF_INFLIGHT;
> +
>   /*
>* After dropping the lock and possibly sleeping here, our request
>* may now be mergeable after it had proven unmergeable (above).
> @@ -2515,6 +2528,7 @@ void blk_start_request(struct request *req)
>   blk_dequeue_request(req);
>  
>   req->issue_time = ktime_to_ns(ktime_get());
> + blk_wb_issue(req->q->rq_wb, req);
>  
>   /*
>* We are now handing the request to the hardware, initialize
> @@ -2751,6 +2765,7 @@ void blk_finish_request(struct request *req, int error)
>   blk_unprep_request(req);
>  
>   blk_account_io_done(req);
> + blk_wb_done(req->q->rq_wb, req);

Hi Jens,

It seems the function blk_wb_done() will be executed twice even if the end_io
callback is set.
Maybe the same thing would happen in blk-mq.c.
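
If blk_wb_done() does not already clear the flag it accounts against, one
way to make the second call harmless would be a guard along these lines.
This is a sketch, not the body from blk-wb.c; 'struct rq_wb' is assumed
from the q->rq_wb usage in the patch:

/*
 * Sketch only: make the completion accounting idempotent, so a second
 * blk_wb_done() from __blk_put_request() / __blk_mq_free_request() is a
 * no-op once the request has been accounted.
 */
static void blk_wb_done(struct rq_wb *rwb, struct request *rq)
{
	if (!rwb || !(rq->cmd_flags & REQ_BUF_INFLIGHT))
		return;

	rq->cmd_flags &= ~REQ_BUF_INFLIGHT;	/* account only once */
	__blk_wb_done(rwb);
}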

>  
>   if (req->end_io)
>   req->end_io(req, error);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 71b4a13fbf94..c0c5207fe7fd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -30,6 +30,7 @@
>  #include "blk-mq.h"
>  #include "blk-mq-tag.h"
>  #include "blk-stat.h"
> +#include "blk-wb.h"
>  
>  static DEFINE_MUTEX(all_q_mutex);
>  static LIST_HEAD(all_q_list);
> @@ -275,6 +276,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx 
> *hctx,
>  
>   if (rq->cmd_flags & REQ_MQ_INFLIGHT)
>   atomic_dec(&hctx->nr_active);
> +
> + blk_wb_done(q->rq_wb, rq);
> +
>   rq->cmd_flags = 0;
>  
>   clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
> @@ -305,6 +309,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
>  inline void __blk_mq_end_request(struct request *rq, int error)
>  {
>   blk_account_io_done(rq);
> + blk_wb_done(rq->q->rq_wb, rq);
>  
>   if (rq->end_io) {
>   rq->end_io(rq, error);
> @@ -414,6 +419,7 @@ void blk_mq_start_request(struct request *rq)
>   rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
>  
>   rq->issue_time = ktime_to_ns(ktime_get());
> + blk_wb_issue(q->rq_wb, rq);
>  
>   blk_add_timer(rq);
>  
> @@ -450,6 +456,7 @@ static void __blk_mq_requeue_request(struct request *rq)
>   struct request_queue *q = rq->q;
>  
>   trace_block_rq_requeue(q, rq);
> + blk_wb_requeue(q->rq_wb, rq);
>  
>   if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
>   if (q->dma_drain_size && blk_rq_bytes(rq))
> @@ -1265,6 +1272,7 @@ static blk_qc_t blk_mq_make_request(struct 
> request_queue *q, struct bio *bio)
>   struct blk_plug *plug;
>   struct request *same_queue_rq = NULL;
>   blk_qc_t cookie;
> + bool wb_acct;
>  
>   blk_queue_bounce(q, &bio);
>  
> @@ -1282,9 +1290,17 @@ static 

[PATCH 8/8] writeback: throttle buffered writeback

2016-04-17 Thread Jens Axboe
Test patch that throttles buffered writeback to make it a lot
more smooth, and has way less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at a time
means that it potentially has heavy impacts on foreground workloads,
which isn't ideal. We can't easily limit the sizes of writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

The algorithm for when to throttle takes its inspiration from the
CoDel network scheduling algorithm. Like CoDel, blk-wb monitors
the minimum latencies of requests over a window of time. In that
window of time, if the minimum latency of any request exceeds a
given target, then a scale count is incremented and the queue depth
is shrunk. The next monitoring window is shrunk accordingly. Unlike
CoDel, if we hit a window that exhibits good behavior, then we
simply increment the scale count and re-calculate the limits for that
scale value. This prevents us from oscillating between a
close-to-ideal value and max all the time, instead remaining in the
windows where we get good behavior.
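
In code, the per-window decision could be sketched roughly as below. This
is an illustration of the description above, not a copy of blk-wb.c; the
sign convention (positive scale_step = more throttling), the field names,
and the blk_stat_min_read_lat()/blk_wb_calc_limits() helpers are
assumptions:

/*
 * Sketch: evaluate one monitoring window. A positive scale_step means we
 * are throttling harder; a good window steps back by one instead of
 * snapping straight back to the maximum limits.
 */
static void blk_wb_window_done(struct rq_wb *rwb)
{
	u64 min_lat = blk_stat_min_read_lat(rwb);	/* assumed helper */

	if (min_lat > rwb->min_lat_nsec) {
		/* target violated: throttle harder, shrink the next window */
		rwb->scale_step++;
		rwb->cur_win_nsec = rwb->win_nsec >> min(rwb->scale_step, 4);
	} else if (rwb->scale_step > 0) {
		/* good window: take a single step back toward full depth */
		rwb->scale_step--;
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_wb_calc_limits(rwb);	/* recompute background/normal/max */
}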

The patch registers two sysfs entries. The first one, 'wb_lat_usec',
sets the latency target for the window. It defaults to 2 msec for
non-rotational storage, and 75 msec for rotational storage. Setting
this value to '0' disables blk-wb.
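
For reference, that default selection could be expressed as follows; only
the 2 msec / 75 msec values come from the text above, while the function
and field names are illustrative:

/* sketch: pick the default latency target from the device type */
static void blk_wb_set_default_latency(struct request_queue *q, struct rq_wb *rwb)
{
	if (blk_queue_nonrot(q))
		rwb->min_lat_nsec = 2 * NSEC_PER_MSEC;	/* non-rotational */
	else
		rwb->min_lat_nsec = 75 * NSEC_PER_MSEC;	/* rotational */
}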

The second entry, 'wb_stats', is a debug entry that simply shows the
current internal state of the throttling machine:

$ cat /sys/block/nvme0n1/queue/wb_stats
background=16, normal=32, max=64, inflight=0, wait=0, bdp_wait=0

'background' denotes how many requests we will allow in-flight for
idle background buffered writeback, 'normal' for higher priority
writeback, and 'max' for when it's urgent we clean pages.

'inflight' shows how many requests are currently in-flight for
buffered writeback, 'wait' shows if anyone is currently waiting for
access, and 'bdp_wait' shows if someone is currently throttled on this
device in balance_dirty_pages().
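
The three limits scale together with the current step. Arithmetic along
the following lines reproduces the numbers above (and the step=1 values in
the trace example further down); it is a sketch assuming scale_step >= 0,
and the exact calculation in blk-wb.c may differ:

/*
 * Sketch: derive the in-flight limits from the device queue depth and
 * the current scale step. For a depth of 64 this gives 16/32/64 at
 * step 0 and 8/16/32 at step 1.
 */
static void blk_wb_calc_limits(struct rq_wb *rwb)
{
	unsigned int depth = rwb->queue_depth;

	rwb->wb_max = 1 + ((depth - 1) >> min(31, rwb->scale_step));
	rwb->wb_normal = (rwb->wb_max + 1) / 2;
	rwb->wb_background = (rwb->wb_max + 3) / 4;
}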

blk-wb also registers a few trace events that can be used to monitor
the state changes:

block_wb_lat: Latency 2446318

block_wb_stat:  read lat: mean=2446318, min=2446318, max=2446318, samples=1,
   write lat: mean=518866, min=15522, max=5330353, samples=57

block_wb_step: step down: step=1, background=8, normal=16, max=32

'block_wb_lat' logs a violation in sync issue latency, 'block_wb_stat'
logs a window violation of latencies and dumps the stats that led to
it, and finally, 'block_wb_step' logs a step up/down and the new
limits associated with that state.
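
The ~98 lines added to include/trace/events/block.h in the diffstat below
are these event definitions. As an illustration of the shape such an event
takes (the exact fields in the patch may differ), block_wb_step could be
declared along these lines:

TRACE_EVENT(block_wb_step,

	TP_PROTO(const char *msg, int step, unsigned int background,
		 unsigned int normal, unsigned int max),

	TP_ARGS(msg, step, background, normal, max),

	TP_STRUCT__entry(
		__string(msg, msg)
		__field(int, step)
		__field(unsigned int, background)
		__field(unsigned int, normal)
		__field(unsigned int, max)
	),

	TP_fast_assign(
		__assign_str(msg, msg);
		__entry->step = step;
		__entry->background = background;
		__entry->normal = normal;
		__entry->max = max;
	),

	TP_printk("%s: step=%d, background=%u, normal=%u, max=%u",
		  __get_str(msg), __entry->step, __entry->background,
		  __entry->normal, __entry->max)
);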

Signed-off-by: Jens Axboe 
---
 block/Makefile   |   2 +-
 block/blk-core.c |  15 ++
 block/blk-mq.c   |  31 ++-
 block/blk-settings.c |   4 +
 block/blk-sysfs.c|  57 +
 block/blk-wb.c   | 495 +++
 block/blk-wb.h   |  42 
 include/linux/blk_types.h|   2 +
 include/linux/blkdev.h   |   3 +
 include/trace/events/block.h |  98 +
 10 files changed, 746 insertions(+), 3 deletions(-)
 create mode 100644 block/blk-wb.c
 create mode 100644 block/blk-wb.h

diff --git a/block/Makefile b/block/Makefile
index 3446e0472df0..7e4be7a56a59 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-   blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+   blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o blk-wb.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
diff --git a/block/blk-core.c b/block/blk-core.c
index 40b57bf4852c..d941f69dfb4b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-wb.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -880,6 +881,7 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,
 
 fail:
blk_free_flush_queue(q->fq);
+   blk_wb_exit(q);
return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1395,6 +1397,7 @@ void blk_requeue_request(struct request_queue *q, struct 
request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+   blk_wb_requeue(q->rq_wb, rq);
 
if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1485,6 +1488,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
   

[PATCH 8/8] writeback: throttle buffered writeback

2016-03-23 Thread Jens Axboe
Test patch that throttles buffered writeback to make it a lot
more smooth, and has way less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at a time
means that it potentially has heavy impacts on foreground workloads,
which isn't ideal. We can't easily limit the sizes of writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

Would likely need a dynamic adaption to the current device, this
one has only been tested on NVMe. But it brings down background
activity impact from 1-2s to tens of milliseconds instead.

This is just a test patch, and as such, it registers queue sysfs
entries to both monitor and tweak the current state:

$ cat /sys/block/nvme0n1/queue/wb_stats
limit=4, batch=2, inflight=0, wait=0, timer=0

'limit' denotes how many requests we will allow inflight for buffered
writeback; this setting can be tweaked by writing to the
'wb_depth' file. Writing '0' turns this off completely. 'inflight' shows
how many requests are currently inflight for buffered writeback, 'wait'
shows if anyone is currently waiting for access, and 'timer' shows
if we have processes being deferred by the write back cache timeout.

Background buffered writeback will be throttled at depth 'wb_depth',
and even lower (QD=1) if the device recently completed "competing" IO.
If we are doing reclaim or otherwise sync buffered writeback, the limit
is increased 4x to achieve full device bandwidth.

Finally, if the device has write back caching, 'wb_cache_delay' specifies
how many usecs to wait after a write completes before allowing more
writeback.
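
Putting those rules together, the depth selection for this version can be
sketched as follows. The field names and the 100 msec "recent competing
IO" window are illustrative; only the QD=1 and 4x behaviors come from the
description above:

/*
 * Sketch: pick the allowed buffered-writeback depth. 'wb_depth' == 0
 * disables throttling entirely; recently completed competing IO drops
 * us to QD=1; reclaim/sync writeback gets 4x the depth for full
 * device bandwidth.
 */
static unsigned int blk_wb_depth_limit(struct rq_wb *rwb, bool sync_or_reclaim)
{
	if (!rwb->wb_depth)
		return UINT_MAX;	/* throttling disabled */

	if (time_before(jiffies, rwb->last_comp + HZ / 10))
		return 1;		/* competing IO just finished */

	if (sync_or_reclaim)
		return 4 * rwb->wb_depth;

	return rwb->wb_depth;
}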

Signed-off-by: Jens Axboe 
---
 block/Makefile|   2 +-
 block/blk-core.c  |  15 
 block/blk-mq.c|  32 ++-
 block/blk-sysfs.c |  84 ++
 block/blk-wb.c| 219 ++
 block/blk-wb.h|  27 ++
 include/linux/blk_types.h |   2 +
 include/linux/blkdev.h|   3 +
 8 files changed, 381 insertions(+), 3 deletions(-)
 create mode 100644 block/blk-wb.c
 create mode 100644 block/blk-wb.h

diff --git a/block/Makefile b/block/Makefile
index 9eda2322b2d4..9df911a3b569 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-   blk-lib.o blk-mq.o blk-mq-tag.o \
+   blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
diff --git a/block/blk-core.c b/block/blk-core.c
index 827f8badd143..887a9e64c6ef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-wb.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -848,6 +849,9 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,
	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
 
+   if (blk_buffered_writeback_init(q))
+   goto fail;
+
	INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn   = rfn;
q->prep_rq_fn   = NULL;
@@ -880,6 +884,7 @@ blk_init_allocated_queue(struct request_queue *q, 
request_fn_proc *rfn,
 
 fail:
blk_free_flush_queue(q->fq);
+   blk_buffered_writeback_exit(q);
return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1485,6 +1490,8 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
 
+   blk_buffered_writeback_done(q->rq_wb, req);
+
/*
 * Request may not have originated from ll_rw_blk. if not,
 * it didn't come out of our reserved rq pools
@@ -1714,6 +1721,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+   bool wb_acct;
 
/*
 * low level driver can indicate that it wants pages above a
@@ -1766,6 +1774,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
}
 
 get_rq:
+   wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, q->queue_lock);
+
/*
 * This sync check and mask will be re-done in init_request_from_bio(),
 * but we need to set it earlier to expose the sync flag to the
@@ -1781,11 +1791,16 @@ get_rq:
 */
req = 
