[PATCH V4 04/16] block, bfq: modify the peak-rate estimator

2017-04-12 Thread Paolo Valente
Unless the maximum budget B_max that BFQ can assign to a queue is set
explicitly by the user, BFQ automatically updates B_max. In
particular, BFQ dynamically sets B_max to the number of sectors that
can be read, at the current estimated peak rate, during the maximum
time, T_max, allowed before a budget timeout occurs. In formulas, if
we denote as R_est the estimated peak rate, then B_max = T_max ∗
R_est. Hence, the higher R_est is with respect to the actual device
peak rate, the higher the probability that processes incur budget
timeouts unjustly is. Besides, a too high value of B_max unnecessarily
increases the deviation from an ideal, smooth service.

Unfortunately, it is not trivial to estimate the peak rate correctly:
because of the presence of sw and hw queues between the scheduler and
the device components that finally serve I/O requests, it is hard to
say exactly when a given dispatched request is served inside the
device, and for how long. As a consequence, it is hard to know
precisely at what rate a given set of requests is actually served by
the device.

On the opposite end, the dispatch time of any request is trivially
available, and, from this piece of information, the "dispatch rate"
of requests can be immediately computed. So, the idea in the next
function is to use what is known, namely request dispatch times
(plus, when useful, request completion times), to estimate what is
unknown, namely in-device request service rate.

The main issue is that, because of the above facts, the rate at
which a certain set of requests is dispatched over a certain time
interval can vary greatly with respect to the rate at which the
same requests are then served. But, since the size of any
intermediate queue is limited, and the service scheme is lossless
(no request is silently dropped), the following obvious convergence
property holds: the number of requests dispatched MUST become
closer and closer to the number of requests completed as the
observation interval grows. This is the key property used in
this new version of the peak-rate estimator.

Signed-off-by: Paolo Valente 
Signed-off-by: Arianna Avanzini 
---
 block/bfq-iosched.c | 497 +++-
 1 file changed, 372 insertions(+), 125 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 1edac72..61d880b 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -407,19 +407,37 @@ struct bfq_data {
/* on-disk position of the last served request */
sector_t last_position;
 
+   /* time of last request completion (ns) */
+   u64 last_completion;
+
+   /* time of first rq dispatch in current observation interval (ns) */
+   u64 first_dispatch;
+   /* time of last rq dispatch in current observation interval (ns) */
+   u64 last_dispatch;
+
/* beginning of the last budget */
ktime_t last_budget_start;
/* beginning of the last idle slice */
ktime_t last_idling_start;
-   /* number of samples used to calculate @peak_rate */
+
+   /* number of samples in current observation interval */
int peak_rate_samples;
+   /* num of samples of seq dispatches in current observation interval */
+   u32 sequential_samples;
+   /* total num of sectors transferred in current observation interval */
+   u64 tot_sectors_dispatched;
+   /* max rq size seen during current observation interval (sectors) */
+   u32 last_rq_max_size;
+   /* time elapsed from first dispatch in current observ. interval (us) */
+   u64 delta_from_first;
/*
-* Peak read/write rate, observed during the service of a
-* budget [BFQ_RATE_SHIFT * sectors/usec]. The value is
-* left-shifted by BFQ_RATE_SHIFT to increase precision in
+* Current estimate of the device peak rate, measured in
+* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+* BFQ_RATE_SHIFT is performed to increase precision in
 * fixed-point calculations.
 */
-   u64 peak_rate;
+   u32 peak_rate;
+
/* maximum budget allotted to a bfq_queue before rescheduling */
int bfq_max_budget;
 
@@ -740,7 +758,7 @@ static const int bfq_timeout = HZ / 8;
 
 static struct kmem_cache *bfq_pool;
 
-/* Below this threshold (in ms), we consider thinktime immediate. */
+/* Below this threshold (in ns), we consider thinktime immediate. */
 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
 
 /* hw_tag detection: parallel requests threshold and min samples needed. */
@@ -752,8 +770,12 @@ static struct kmem_cache *bfq_pool;
 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq)   (hweight32(bfqq->seek_history) > 32/8)
 
-/* Min samples used for peak rate estimation (for autotuning). */
-#define BFQ_PEAK_RATE_SAMPLES  32
+/* Min number of samples required to perform peak-rate update */
+#define 

[PATCH V4 04/16] block, bfq: modify the peak-rate estimator

2017-04-12 Thread Paolo Valente
Unless the maximum budget B_max that BFQ can assign to a queue is set
explicitly by the user, BFQ automatically updates B_max. In
particular, BFQ dynamically sets B_max to the number of sectors that
can be read, at the current estimated peak rate, during the maximum
time, T_max, allowed before a budget timeout occurs. In formulas, if
we denote as R_est the estimated peak rate, then B_max = T_max ∗
R_est. Hence, the higher R_est is with respect to the actual device
peak rate, the higher the probability that processes incur budget
timeouts unjustly is. Besides, a too high value of B_max unnecessarily
increases the deviation from an ideal, smooth service.

Unfortunately, it is not trivial to estimate the peak rate correctly:
because of the presence of sw and hw queues between the scheduler and
the device components that finally serve I/O requests, it is hard to
say exactly when a given dispatched request is served inside the
device, and for how long. As a consequence, it is hard to know
precisely at what rate a given set of requests is actually served by
the device.

On the opposite end, the dispatch time of any request is trivially
available, and, from this piece of information, the "dispatch rate"
of requests can be immediately computed. So, the idea in the next
function is to use what is known, namely request dispatch times
(plus, when useful, request completion times), to estimate what is
unknown, namely in-device request service rate.

The main issue is that, because of the above facts, the rate at
which a certain set of requests is dispatched over a certain time
interval can vary greatly with respect to the rate at which the
same requests are then served. But, since the size of any
intermediate queue is limited, and the service scheme is lossless
(no request is silently dropped), the following obvious convergence
property holds: the number of requests dispatched MUST become
closer and closer to the number of requests completed as the
observation interval grows. This is the key property used in
this new version of the peak-rate estimator.

Signed-off-by: Paolo Valente 
Signed-off-by: Arianna Avanzini 
---
 block/bfq-iosched.c | 497 +++-
 1 file changed, 372 insertions(+), 125 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 1edac72..61d880b 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -407,19 +407,37 @@ struct bfq_data {
/* on-disk position of the last served request */
sector_t last_position;
 
+   /* time of last request completion (ns) */
+   u64 last_completion;
+
+   /* time of first rq dispatch in current observation interval (ns) */
+   u64 first_dispatch;
+   /* time of last rq dispatch in current observation interval (ns) */
+   u64 last_dispatch;
+
/* beginning of the last budget */
ktime_t last_budget_start;
/* beginning of the last idle slice */
ktime_t last_idling_start;
-   /* number of samples used to calculate @peak_rate */
+
+   /* number of samples in current observation interval */
int peak_rate_samples;
+   /* num of samples of seq dispatches in current observation interval */
+   u32 sequential_samples;
+   /* total num of sectors transferred in current observation interval */
+   u64 tot_sectors_dispatched;
+   /* max rq size seen during current observation interval (sectors) */
+   u32 last_rq_max_size;
+   /* time elapsed from first dispatch in current observ. interval (us) */
+   u64 delta_from_first;
/*
-* Peak read/write rate, observed during the service of a
-* budget [BFQ_RATE_SHIFT * sectors/usec]. The value is
-* left-shifted by BFQ_RATE_SHIFT to increase precision in
+* Current estimate of the device peak rate, measured in
+* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+* BFQ_RATE_SHIFT is performed to increase precision in
 * fixed-point calculations.
 */
-   u64 peak_rate;
+   u32 peak_rate;
+
/* maximum budget allotted to a bfq_queue before rescheduling */
int bfq_max_budget;
 
@@ -740,7 +758,7 @@ static const int bfq_timeout = HZ / 8;
 
 static struct kmem_cache *bfq_pool;
 
-/* Below this threshold (in ms), we consider thinktime immediate. */
+/* Below this threshold (in ns), we consider thinktime immediate. */
 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
 
 /* hw_tag detection: parallel requests threshold and min samples needed. */
@@ -752,8 +770,12 @@ static struct kmem_cache *bfq_pool;
 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq)   (hweight32(bfqq->seek_history) > 32/8)
 
-/* Min samples used for peak rate estimation (for autotuning). */
-#define BFQ_PEAK_RATE_SAMPLES  32
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES   32
+/* Min observation time interval