Re: [PATCH v2] blk-throttle: track read and write request individually

2017-10-22 Thread Joseph Qi
Hi Jens,
Could you please pick this patch?

Thanks,
Joseph

On 17/10/13 01:13, Shaohua Li wrote:
> On Thu, Oct 12, 2017 at 05:15:46PM +0800, Joseph Qi wrote:
>> From: Joseph Qi 
>>
>> In a mixed read/write workload on SSD, write latency is much lower than
>> read latency. But currently we only track and record read latency and
>> then use it as the threshold base for both read and write I/O latency
>> accounting. As a result, write I/O latency is always considered good
>> and bad_bio_cnt stays far below 20% of bio_cnt. This means the tg being
>> checked is treated as idle most of the time and keeps letting others
>> dispatch more I/Os, even when it is actually running under its low
>> limit and wants that limit to be guaranteed, which is not what we
>> expect. So track read and write requests individually, which allows
>> more precise latency control for low limit idle detection.
>>
>> Signed-off-by: Joseph Qi 
> 
> Reviewed-by: Shaohua Li 

Re: [PATCH v2] blk-throttle: track read and write request individually

2017-10-12 Thread Shaohua Li
On Thu, Oct 12, 2017 at 05:15:46PM +0800, Joseph Qi wrote:
> From: Joseph Qi 
> 
> In a mixed read/write workload on SSD, write latency is much lower than
> read latency. But currently we only track and record read latency and
> then use it as the threshold base for both read and write I/O latency
> accounting. As a result, write I/O latency is always considered good
> and bad_bio_cnt stays far below 20% of bio_cnt. This means the tg being
> checked is treated as idle most of the time and keeps letting others
> dispatch more I/Os, even when it is actually running under its low
> limit and wants that limit to be guaranteed, which is not what we
> expect. So track read and write requests individually, which allows
> more precise latency control for low limit idle detection.
> 
> Signed-off-by: Joseph Qi 

Reviewed-by: Shaohua Li 
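
To make the effect described in the commit message concrete, here is a
minimal userspace sketch (not the kernel code; the struct, function, and
threshold names below are invented for illustration) of why judging writes
against a read-derived latency threshold keeps bad_bio_cnt far below 20% of
bio_cnt, while a per-direction threshold does not:

#include <stdbool.h>
#include <stdio.h>

enum { SKETCH_READ = 0, SKETCH_WRITE = 1 };

/* Illustrative per-group counters, mirroring the idea of bio_cnt and
 * bad_bio_cnt in the commit message. */
struct sketch_group {
	unsigned long bio_cnt;
	unsigned long bad_bio_cnt;
};

/* Record one completed bio: it is "bad" if its latency exceeds the
 * threshold chosen for its direction. */
static void sketch_account(struct sketch_group *grp, int rw,
			   unsigned long lat_us,
			   const unsigned long threshold_us[2])
{
	grp->bio_cnt++;
	if (lat_us > threshold_us[rw])
		grp->bad_bio_cnt++;
}

/* The group looks "idle" when fewer than 20% of its bios were bad. */
static bool sketch_is_idle(const struct sketch_group *grp)
{
	return grp->bad_bio_cnt * 5 < grp->bio_cnt;
}

int main(void)
{
	/* Single threshold derived from reads (~500us for both directions)
	 * vs. per-direction thresholds where writes get their own, lower base. */
	const unsigned long read_only_thr[2] = { 500, 500 };
	const unsigned long per_rw_thr[2]    = { 500, 100 };
	struct sketch_group a = { 0 }, b = { 0 };

	for (int i = 0; i < 100; i++) {
		/* Reads around 400us, writes around 150us: the writes are slow
		 * by write standards but always "fast" by read standards. */
		sketch_account(&a, SKETCH_READ, 400, read_only_thr);
		sketch_account(&a, SKETCH_WRITE, 150, read_only_thr);
		sketch_account(&b, SKETCH_READ, 400, per_rw_thr);
		sketch_account(&b, SKETCH_WRITE, 150, per_rw_thr);
	}
	printf("single threshold: idle=%d\n", sketch_is_idle(&a));
	printf("per-rw threshold: idle=%d\n", sketch_is_idle(&b));
	return 0;
}

With the shared read-derived threshold the group always looks idle; with
separate thresholds the slow writes push the bad-bio ratio past 20% and the
group is no longer treated as idle.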

[PATCH v2] blk-throttle: track read and write request individually

2017-10-12 Thread Joseph Qi
From: Joseph Qi 

In a mixed read/write workload on SSD, write latency is much lower than
read latency. But currently we only track and record read latency and
then use it as the threshold base for both read and write I/O latency
accounting. As a result, write I/O latency is always considered good
and bad_bio_cnt stays far below 20% of bio_cnt. This means the tg being
checked is treated as idle most of the time and keeps letting others
dispatch more I/Os, even when it is actually running under its low
limit and wants that limit to be guaranteed, which is not what we
expect. So track read and write requests individually, which allows
more precise latency control for low limit idle detection.

Signed-off-by: Joseph Qi 
---
 block/blk-throttle.c | 134 ++-
 1 file changed, 79 insertions(+), 55 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 17816a0..0897378 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -215,9 +215,9 @@ struct throtl_data
 
unsigned int scale;
 
-   struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
-   struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
-   struct latency_bucket __percpu *latency_buckets;
+   struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+   struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+   struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time;
unsigned long filtered_latency;
 
@@ -2040,10 +2040,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void throtl_update_latency_buckets(struct throtl_data *td)
 {
-   struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
-   int i, cpu;
-   unsigned long last_latency = 0;
-   unsigned long latency;
+   struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+   int i, cpu, rw;
+   unsigned long last_latency[2] = { 0 };
+   unsigned long latency[2];
 
if (!blk_queue_nonrot(td->queue))
return;
@@ -2052,56 +2052,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
td->last_calculate_time = jiffies;
 
memset(avg_latency, 0, sizeof(avg_latency));
-   for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-   struct latency_bucket *tmp = &td->tmp_buckets[i];
-
-   for_each_possible_cpu(cpu) {
-   struct latency_bucket *bucket;
-
-   /* this isn't race free, but ok in practice */
-   bucket = per_cpu_ptr(td->latency_buckets, cpu);
-   tmp->total_latency += bucket[i].total_latency;
-   tmp->samples += bucket[i].samples;
-   bucket[i].total_latency = 0;
-   bucket[i].samples = 0;
-   }
+   for (rw = READ; rw <= WRITE; rw++) {
+   for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+   struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+   for_each_possible_cpu(cpu) {
+   struct latency_bucket *bucket;
+
+   /* this isn't race free, but ok in practice */
+   bucket = per_cpu_ptr(td->latency_buckets[rw],
+   cpu);
+   tmp->total_latency += bucket[i].total_latency;
+   tmp->samples += bucket[i].samples;
+   bucket[i].total_latency = 0;
+   bucket[i].samples = 0;
+   }
 
-   if (tmp->samples >= 32) {
-   int samples = tmp->samples;
+   if (tmp->samples >= 32) {
+   int samples = tmp->samples;
 
-   latency = tmp->total_latency;
+   latency[rw] = tmp->total_latency;
 
-   tmp->total_latency = 0;
-   tmp->samples = 0;
-   latency /= samples;
-   if (latency == 0)
-   continue;
-   avg_latency[i].latency = latency;
+   tmp->total_latency = 0;
+   tmp->samples = 0;
+   latency[rw] /= samples;
+   if (latency[rw] == 0)
+   continue;
+   avg_latency[rw][i].latency = latency[rw];
+   }
}
}
 
-   for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-   if (!avg_latency[i].latency) {
-   if (td->avg_buckets[i].latency < last_latency)
-
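
The following standalone sketch shows the shape of the change under
simplified assumptions: no per-cpu buckets, invented sk_* names, and a toy
size-to-bucket mapping. The point is only that latency samples are binned
into separate READ and WRITE tables, indexed as [rw][i], and averaged per
direction instead of deriving a single read-based average:

#include <stdio.h>

#define SK_LATENCY_BUCKET_SIZE 9
enum { SK_READ = 0, SK_WRITE = 1 };

struct sk_bucket {
	unsigned long long total_latency;
	int samples;
};

/* One bucket table per direction, mirroring latency_buckets[2] in the patch. */
static struct sk_bucket sk_buckets[2][SK_LATENCY_BUCKET_SIZE];

/* Map an I/O size to a bucket index (simplified: one bucket per 4K step). */
static int sk_bucket_idx(unsigned int bytes)
{
	int idx = bytes / 4096;

	return idx < SK_LATENCY_BUCKET_SIZE ? idx : SK_LATENCY_BUCKET_SIZE - 1;
}

/* Record a completed request in the table for its direction. */
static void sk_track(int rw, unsigned int bytes, unsigned long lat_us)
{
	struct sk_bucket *b = &sk_buckets[rw][sk_bucket_idx(bytes)];

	b->total_latency += lat_us;
	b->samples++;
}

int main(void)
{
	int rw, i;

	sk_track(SK_READ, 4096, 420);
	sk_track(SK_READ, 4096, 380);
	sk_track(SK_WRITE, 4096, 90);
	sk_track(SK_WRITE, 4096, 110);

	/* Average per direction and per size bucket, in the spirit of the
	 * reworked throtl_update_latency_buckets() with its [rw][i] loops. */
	for (rw = SK_READ; rw <= SK_WRITE; rw++)
		for (i = 0; i < SK_LATENCY_BUCKET_SIZE; i++)
			if (sk_buckets[rw][i].samples)
				printf("%s bucket %d: avg %lluus over %d samples\n",
				       rw == SK_READ ? "read" : "write", i,
				       sk_buckets[rw][i].total_latency /
				       sk_buckets[rw][i].samples,
				       sk_buckets[rw][i].samples);
	return 0;
}

The kernel version additionally keeps the buckets per-cpu and folds them into
tmp_buckets before averaging, but the per-direction indexing is the same idea.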